com.nridge.core.base.field.data.DataFieldAnalyzer.java Source code

Introduction

Here is the source code for com.nridge.core.base.field.data.DataFieldAnalyzer.java
Source

/*
 * NorthRidge Software, LLC - Copyright (c) 2019.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package com.nridge.core.base.field.data;

import com.nridge.core.base.field.Field;
import com.nridge.core.base.std.DatUtl;
import com.nridge.core.base.std.StrUtl;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;

import java.util.Date;
import java.util.HashMap;
import java.util.Map;

/**
 * The Data Field Analyzer class will examine small-to-medium sized
 * data sets to determine their type and value composition.
 *
 * Typical usage is to feed every value of a single field through
 * {@link #scan(String)} and then ask for the derived type via
 * {@link #getType()} or the full analysis via {@link #getDetails(int)}.
 *
 * A good enhancement to this package would be the inclusion of the
 * Apache Commons Math features - see link below.
 *
 * @see <a href="http://commons.apache.org/proper/commons-math/userguide/stat.html">Apache Commons Math</a>
 *
 * @author Al Cole
 * @since 1.0
 */
public class DataFieldAnalyzer {
    private String mName;
    private int mNullCount;
    private boolean mIsDate;
    private boolean mIsFloat;
    private int mTotalValues;
    private boolean mIsInteger;
    private boolean mIsBoolean;
    private Map<String, Integer> mValueCount;

    /**
     * Constructor with a unique field name.
     *
     * @param aName Field name.
     */
    public DataFieldAnalyzer(String aName) {
        reset(aName);
    }

    /**
     * Use this if you wish to reuse the object instance. All counters
     * are zeroed, the value map is replaced with an empty one and every
     * type candidate flag is set back to <i>true</i> (a candidate is only
     * ruled out once a scanned value contradicts it).
     *
     * @param aName Field name.
     */
    public void reset(String aName) {
        mName = aName;
        mNullCount = 0;
        mIsDate = true;
        mIsFloat = true;
        mTotalValues = 0;
        mIsInteger = true;
        mIsBoolean = true;
        mValueCount = new HashMap<>();
    }

    /**
     * Convenience method for the DataAnalyzer class to generate a
     * table definition instance. The definition holds the fixed
     * summary fields followed by a value/count/percent triple of
     * fields for each requested sample position (numbered from 01).
     *
     * @param aSampleCount Sample of top counts of data values.
     *
     * @return DataBag instance.
     */
    public DataBag createDefinition(int aSampleCount) {
        String fieldName, fieldTitle;

        DataBag detailsBag = new DataBag(mName);
        detailsBag.add(new DataTextField("name", "Name"));
        detailsBag.add(new DataTextField("type", "Type"));
        detailsBag.add(new DataIntegerField("total_count", "Total Count"));
        detailsBag.add(new DataIntegerField("null_count", "Null Count"));
        detailsBag.add(new DataIntegerField("unique_count", "Unique Count"));
        detailsBag.add(new DataTextField("minimum", "Minimum"));
        detailsBag.add(new DataTextField("maximum", "Maximum"));
        detailsBag.add(new DataTextField("mean", "Mean"));
        detailsBag.add(new DataTextField("standard_deviation", "Deviation"));
        for (int col = 0; col < aSampleCount; col++) {
            fieldName = String.format("value_%02d", col + 1);
            fieldTitle = String.format("Value %02d", col + 1);
            detailsBag.add(new DataTextField(fieldName, fieldTitle));
            fieldName = String.format("count_%02d", col + 1);
            fieldTitle = String.format("Count %02d", col + 1);
            detailsBag.add(new DataIntegerField(fieldName, fieldTitle));
            fieldName = String.format("percent_%02d", col + 1);
            fieldTitle = String.format("Percent %02d", col + 1);
            detailsBag.add(new DataTextField(fieldName, fieldTitle));
        }

        return detailsBag;
    }

    /**
     * Identifies if a numeric type (float or integer) is still a candidate.
     *
     * @return <i>true</i> if either numeric flag is still set.
     */
    private boolean isNumberType() {
        return mIsFloat || mIsInteger;
    }

    /**
     * Rules out every type candidate that the value contradicts. Flags
     * are only ever cleared here - they are never set back to <i>true</i>.
     *
     * @param aValue Non-empty data value.
     */
    private void scanType(String aValue) {
        if (isNumberType()) {
            if (NumberUtils.isParsable(aValue)) {
                // A parsable number containing a dot can no longer be an integer.
                if (aValue.indexOf(StrUtl.CHAR_DOT) != -1) {
                    mIsInteger = false;
                }
            } else {
                mIsInteger = false;
                mIsFloat = false;
            }
        }
        if (mIsDate) {
            Date fieldDate = DatUtl.detectCreateDate(aValue);
            if (fieldDate == null) {
                mIsDate = false;
            }
        }
        if (mIsBoolean) {
            boolean isBooleanText = aValue.equalsIgnoreCase(StrUtl.STRING_TRUE)
                    || aValue.equalsIgnoreCase(StrUtl.STRING_YES)
                    || aValue.equalsIgnoreCase(StrUtl.STRING_FALSE)
                    || aValue.equalsIgnoreCase(StrUtl.STRING_NO);
            if (!isBooleanText) {
                mIsBoolean = false;
            }
        }
    }

    /**
     * Scans the data value to determine its type and metric information.
     * Every call increments the total count. A <i>null</i> or empty
     * value is tallied as a null, while any other value is type checked
     * and has its occurrence count incremented.
     *
     * @param aValue Data value.
     */
    public void scan(String aValue) {
        mTotalValues++;
        if (StringUtils.isNotEmpty(aValue)) {
            scanType(aValue);
            Integer curCount = mValueCount.get(aValue);
            if (curCount == null) {
                mValueCount.put(aValue, 1);
            } else {
                mValueCount.put(aValue, curCount + 1);
            }
        } else {
            mNullCount++;
        }
    }

    /**
     * Returns the derived type information once the scanning process
     * is complete. Candidates are evaluated in the order boolean,
     * integer, float and date/time, with text being the fallback.
     *
     * Note that if no non-empty value has been scanned, then all of
     * the candidate flags are still set and boolean will be returned.
     *
     * @return Field type.
     */
    public Field.Type getType() {
        if (mIsBoolean) {
            return Field.Type.Boolean;
        } else if (mIsInteger) {
            return Field.Type.Integer;
        } else if (mIsFloat) {
            return Field.Type.Float;
        } else if (mIsDate) {
            return Field.Type.DateTime;
        } else {
            return Field.Type.Text;
        }
    }

    /**
     * Returns a data bag of fields describing the scanned value data.
     * The bag will contain the field name, derived type, total count,
     * null count, unique count, minimum, maximum and a sample count of
     * values (with overall percentages) that repeated most often.
     *
     * The minimum and maximum are derived from the value lengths for
     * text fields, the numeric values for number fields and the parsed
     * timestamps for date/time fields. Boolean fields always report the
     * false/true string constants.
     *
     * @param aSampleCount Identifies the top count of values.
     *
     * @return Data bag of analysis details.
     */
    public DataBag getDetails(int aSampleCount) {
        String fieldName, fieldTitle;

        Field.Type fieldType = getType();
        int uniqueValues = mValueCount.size();
        DataBag detailsBag = new DataBag(mName);
        detailsBag.add(new DataTextField("name", "Name", mName));
        detailsBag.add(new DataTextField("type", "Type", Field.typeToString(fieldType)));
        detailsBag.add(new DataIntegerField("total_count", "Total Count", mTotalValues));
        detailsBag.add(new DataIntegerField("null_count", "Null Count", mNullCount));
        detailsBag.add(new DataIntegerField("unique_count", "Unique Count", uniqueValues));

        // Create a table from the values map and use sorting to get our top sample size.

        DataTable valuesTable = new DataTable(mName);
        valuesTable.add(new DataTextField("value", "Value"));
        valuesTable.add(new DataIntegerField("count", "Count"));
        valuesTable.add(new DataDoubleField("percentage", "Percentage"));

        // Double.MIN_VALUE is the smallest *positive* double, so the maximum must be
        // seeded with the most negative finite value or negative data is misreported.

        double minValue = Double.MAX_VALUE;
        double maxValue = -Double.MAX_VALUE;
        for (Map.Entry<String, Integer> entry : mValueCount.entrySet()) {
            String dataValue = entry.getKey();
            Integer valueCount = entry.getValue();
            double valuePercentage;
            if (mTotalValues == 0) {
                valuePercentage = 0.0;
            } else {
                valuePercentage = valueCount.doubleValue() / mTotalValues * 100.0;
            }

            valuesTable.newRow();
            valuesTable.setValueByName("value", dataValue);
            valuesTable.setValueByName("count", valueCount);
            valuesTable.setValueByName("percentage", String.format("%.2f", valuePercentage));
            if (Field.isText(fieldType)) {
                minValue = Math.min(minValue, dataValue.length());
                maxValue = Math.max(maxValue, dataValue.length());
            } else if (Field.isNumber(fieldType)) {
                double numericValue = Double.parseDouble(dataValue);
                minValue = Math.min(minValue, numericValue);
                maxValue = Math.max(maxValue, numericValue);
            } else if (Field.isDateOrTime(fieldType)) {

                // While we are decomposing the date to milliseconds of time, you can do a Date(milliseconds)
                // reconstruction.

                Date dateValue = DatUtl.detectCreateDate(dataValue);
                if (dateValue != null) {    // guard the parsed date, not the source string
                    minValue = Math.min(minValue, dateValue.getTime());
                    maxValue = Math.max(maxValue, dateValue.getTime());
                }
            }
            valuesTable.addRow();
        }
        valuesTable.sortByColumn("count", Field.Order.DESCENDING);

        if (Field.isBoolean(fieldType)) {
            detailsBag.add(new DataTextField("minimum", "Minimum", StrUtl.STRING_FALSE));
            detailsBag.add(new DataTextField("maximum", "Maximum", StrUtl.STRING_TRUE));
        } else if (Field.isDateOrTime(fieldType)) {
            detailsBag.add(new DataTextField("minimum", "Minimum",
                    Field.dateValueFormatted(new Date((long) minValue), Field.FORMAT_DATETIME_DEFAULT)));
            detailsBag.add(new DataTextField("maximum", "Maximum",
                    Field.dateValueFormatted(new Date((long) maxValue), Field.FORMAT_DATETIME_DEFAULT)));
        } else {
            detailsBag.add(new DataTextField("minimum", "Minimum", String.format("%.2f", minValue)));
            detailsBag.add(new DataTextField("maximum", "Maximum", String.format("%.2f", maxValue)));
        }

        // Create columns for the top sample sizes (value, matching count, matching percentage)

        int adjCount = Math.min(aSampleCount, valuesTable.rowCount());
        for (int row = 0; row < adjCount; row++) {
            fieldName = String.format("value_%02d", row + 1);
            fieldTitle = String.format("Value %02d", row + 1);
            String sampleValue = valuesTable.getValueByName(row, "value");
            detailsBag.add(new DataTextField(fieldName, fieldTitle, sampleValue));
            fieldName = String.format("count_%02d", row + 1);
            fieldTitle = String.format("Count %02d", row + 1);
            detailsBag.add(new DataIntegerField(fieldName, fieldTitle, valuesTable.getValueByName(row, "count")));
            fieldName = String.format("percent_%02d", row + 1);
            fieldTitle = String.format("Percent %02d", row + 1);
            detailsBag
                    .add(new DataDoubleField(fieldName, fieldTitle, valuesTable.getValueByName(row, "percentage")));
        }

        return detailsBag;
    }
}