org.apache.hadoop.hive.ql.stats.StatsUtils.java Source code

Introduction

Here is the source code for org.apache.hadoop.hive.ql.stats.StatsUtils.java, the Hive utility class that collects and estimates table, partition, and column level statistics for query planning.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.stats;

import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.google.common.math.LongMath;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.metastore.api.AggrStats;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.Decimal;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.plan.ColStatistics;
import org.apache.hadoop.hive.ql.plan.ColStatistics.Range;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnListDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.Statistics;
import org.apache.hadoop.hive.ql.plan.Statistics.State;
import org.apache.hadoop.hive.ql.util.JavaDataModel;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantMapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardMapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBinaryObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBooleanObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableByteObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantBinaryObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantHiveCharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantHiveVarcharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantStringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDateObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableFloatObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveCharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveDecimalObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveVarcharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableIntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableLongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableShortObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector;
import org.apache.hadoop.io.BytesWritable;
import org.apache.tez.mapreduce.hadoop.MRJobConfig;

import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

public class StatsUtils {

    private static final Log LOG = LogFactory.getLog(StatsUtils.class.getName());

    /**
     * Collect table, partition and column level statistics
     * @param conf
     *          - hive configuration
     * @param partList
     *          - partition list
     * @param table
     *          - table
     * @param tableScanOperator
     *          - table scan operator
     * @return statistics object
     * @throws HiveException
     */
    public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table,
            TableScanOperator tableScanOperator) throws HiveException {

        // column level statistics are required only for the columns that are needed
        List<ColumnInfo> schema = tableScanOperator.getSchema().getSignature();
        List<String> neededColumns = tableScanOperator.getNeededColumns();
        List<String> referencedColumns = tableScanOperator.getReferencedColumns();

        return collectStatistics(conf, partList, table, schema, neededColumns, referencedColumns);
    }

    private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table,
            List<ColumnInfo> schema, List<String> neededColumns, List<String> referencedColumns)
            throws HiveException {

        boolean fetchColStats = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_COLUMN_STATS);
        boolean fetchPartStats = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_PARTITION_STATS);

        return collectStatistics(conf, partList, table, schema, neededColumns, referencedColumns, fetchColStats,
                fetchPartStats);
    }

    public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table,
            List<ColumnInfo> schema, List<String> neededColumns, List<String> referencedColumns,
            boolean fetchColStats, boolean fetchPartStats) throws HiveException {

        Statistics stats = new Statistics();

        float deserFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR);

        if (!table.isPartitioned()) {
            long nr = getNumRows(table);
            long ds = getRawDataSize(table);
            if (ds <= 0) {
                ds = getTotalSize(table);

                // if data size is still 0 then get file size
                if (ds <= 0) {
                    ds = getFileSizeForTable(conf, table);
                }

                ds = (long) (ds * deserFactor);
            }

            // a row count of -1 means the statistics from the metastore are not
            // reliable, and 0 means statistics gathering is disabled
            if (nr <= 0) {
                int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns);
                if (avgRowSize > 0) {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Estimated average row size: " + avgRowSize);
                    }
                    nr = ds / avgRowSize;
                }
            }
            if (nr == 0) {
                nr = 1;
            }
            stats.setNumRows(nr);
            stats.setDataSize(ds);

            List<ColStatistics> colStats = Lists.newArrayList();
            if (fetchColStats) {
                colStats = getTableColumnStats(table, schema, neededColumns);
            }

            // infer if any column can be primary key based on column statistics
            inferAndSetPrimaryKey(stats.getNumRows(), colStats);

            stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
            stats.addToColumnStats(colStats);
        } else if (partList != null) {
            // For partitioned tables, get the size of all the partitions after pruning
            // the partitions that are not required
            long nr = 0;
            long ds = 0;

            List<Long> rowCounts = Lists.newArrayList();
            List<Long> dataSizes = Lists.newArrayList();

            if (fetchPartStats) {
                rowCounts = getBasicStatForPartitions(table, partList.getNotDeniedPartns(),
                        StatsSetupConst.ROW_COUNT);
                dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(),
                        StatsSetupConst.RAW_DATA_SIZE);

                nr = getSumIgnoreNegatives(rowCounts);
                ds = getSumIgnoreNegatives(dataSizes);
                if (ds <= 0) {
                    dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(),
                            StatsSetupConst.TOTAL_SIZE);
                    ds = getSumIgnoreNegatives(dataSizes);
                }
            }

            // if the data size still could not be determined, fall back to the
            // filesystem to get the file sizes
            if (ds <= 0) {
                dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns());
            }
            ds = getSumIgnoreNegatives(dataSizes);
            ds = (long) (ds * deserFactor);

            int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns);
            if (avgRowSize > 0) {
                setUnknownRcDsToAverage(rowCounts, dataSizes, avgRowSize);
                nr = getSumIgnoreNegatives(rowCounts);
                ds = getSumIgnoreNegatives(dataSizes);

                // a row count of -1 means the metastore statistics are not reliable
                if (nr <= 0) {
                    nr = ds / avgRowSize;
                }
            }
            if (nr == 0) {
                nr = 1;
            }
            stats.addToNumRows(nr);
            stats.addToDataSize(ds);

            // if at least one partition is missing a row count, mark the basic stats state as PARTIAL
            if (containsNonPositives(rowCounts) && stats.getBasicStatsState().equals(State.COMPLETE)) {
                stats.setBasicStatsState(State.PARTIAL);
            }
            if (fetchColStats) {
                List<String> partNames = new ArrayList<String>(partList.getNotDeniedPartns().size());
                for (Partition part : partList.getNotDeniedPartns()) {
                    partNames.add(part.getName());
                }
                neededColumns = processNeededColumns(schema, neededColumns);
                AggrStats aggrStats = null;
                // If either neededColumns or partNames is empty, the metastore
                // call would return null aggrStats anyway (after several retries),
                // so we can skip connecting to the metastore entirely.
                if (neededColumns.size() > 0 && partNames.size() > 0) {
                    aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(),
                            neededColumns, partNames);
                }
                if (null == aggrStats || null == aggrStats.getColStats() || aggrStats.getColStatsSize() == 0) {
                    // There are some partitions with no state (or we didn't fetch any state).
                    // Update the stats with empty list to reflect that in the
                    // state/initialize structures.
                    List<ColStatistics> emptyStats = Lists.newArrayList();

                    // add partition column stats
                    addParitionColumnStats(conf, neededColumns, referencedColumns, schema, table, partList,
                            emptyStats);

                    stats.addToColumnStats(emptyStats);
                    stats.updateColumnStatsState(deriveStatType(emptyStats, referencedColumns));
                } else {
                    List<ColumnStatisticsObj> colStats = aggrStats.getColStats();
                    if (colStats.size() != neededColumns.size()) {
                        LOG.debug("Column stats requested for : " + neededColumns.size() + " columns. Able to"
                                + " retrieve for " + colStats.size() + " columns");
                    }
                    List<ColStatistics> columnStats = convertColStats(colStats, table.getTableName());

                    addParitionColumnStats(conf, neededColumns, referencedColumns, schema, table, partList,
                            columnStats);

                    // infer if any column can be primary key based on column statistics
                    inferAndSetPrimaryKey(stats.getNumRows(), columnStats);

                    stats.addToColumnStats(columnStats);
                    State colState = deriveStatType(columnStats, referencedColumns);
                    if (aggrStats.getPartsFound() != partNames.size() && colState != State.NONE) {
                        LOG.debug("Column stats requested for : " + partNames.size() + " partitions. "
                                + "Able to retrieve for " + aggrStats.getPartsFound() + " partitions");
                        colState = State.PARTIAL;
                    }
                    stats.setColumnStatsState(colState);
                }
            }
        }
        return stats;
    }
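
    // Usage sketch (illustrative, not part of the original file): the planner
    // typically calls this once per table scan, e.g.
    //   Statistics s = StatsUtils.collectStatistics(conf, partList, table, tsop);
    //   long rows = s.getNumRows();
    // Note the data-size fallback order above: rawDataSize -> totalSize ->
    // filesystem size, then scaled by HIVE_STATS_DESERIALIZATION_FACTOR.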

    /**
     * Based on the provided column statistics and number of rows, this method infers whether a
     * column can be a primary key: either its distinct-value count reaches the row count, or the
     * difference between its min and max values, plus one, equals the number of rows.
     * @param numRows - number of rows
     * @param colStats - column statistics
     */
    public static void inferAndSetPrimaryKey(long numRows, List<ColStatistics> colStats) {
        if (colStats != null) {
            for (ColStatistics cs : colStats) {
                if (cs != null && cs.getCountDistint() >= numRows) {
                    cs.setPrimaryKey(true);
                } else if (cs != null && cs.getRange() != null && cs.getRange().minValue != null
                        && cs.getRange().maxValue != null) {
                    if (numRows == ((cs.getRange().maxValue.longValue() - cs.getRange().minValue.longValue())
                            + 1)) {
                        cs.setPrimaryKey(true);
                    }
                }
            }
        }
    }
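
    // Illustrative sketch (hypothetical demo method, not part of the original
    // source): an id column whose value range exactly spans the row count is
    // flagged as a candidate primary key by the heuristic above.
    @SuppressWarnings("unused")
    private static void demoInferPrimaryKey() {
        ColStatistics idStats = new ColStatistics("id", serdeConstants.BIGINT_TYPE_NAME);
        idStats.setRange(1L, 100L); // (max - min) + 1 == numRows == 100
        inferAndSetPrimaryKey(100L, Lists.newArrayList(idStats));
        boolean isPrimaryKey = idStats.isPrimaryKey(); // true under the heuristic
    }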

    /**
     * Infer foreign key relationship from given column statistics.
     * @param csPK - column statistics of primary key
     * @param csFK - column statistics of potential foreign key
     * @return true if the potential foreign key's value range lies within the primary key's range
     */
    public static boolean inferForeignKey(ColStatistics csPK, ColStatistics csFK) {
        if (csPK != null && csFK != null) {
            if (csPK.isPrimaryKey()) {
                if (csPK.getRange() != null && csFK.getRange() != null) {
                    ColStatistics.Range pkRange = csPK.getRange();
                    ColStatistics.Range fkRange = csFK.getRange();
                    return isWithin(fkRange, pkRange);
                }
            }
        }
        return false;
    }
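
    // Illustrative sketch (hypothetical demo method): a foreign key whose
    // observed range is contained in a flagged primary key's range is inferred
    // to reference it, per isWithin() below.
    @SuppressWarnings("unused")
    private static void demoInferForeignKey() {
        ColStatistics pk = new ColStatistics("pk_id", serdeConstants.INT_TYPE_NAME);
        pk.setRange(1L, 1000L);
        pk.setPrimaryKey(true);
        ColStatistics fk = new ColStatistics("fk_id", serdeConstants.INT_TYPE_NAME);
        fk.setRange(10L, 500L);
        boolean isFk = inferForeignKey(pk, fk); // true: [10, 500] lies within [1, 1000]
    }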

    /**
     * Scale selectivity based on key range ratio.
     * @param csPK - column statistics of primary key
     * @param csFK - column statistics of potential foreign key
     * @return scaled selectivity, or 1.0 when no scaling applies
     */
    public static float getScaledSelectivity(ColStatistics csPK, ColStatistics csFK) {
        float scaledSelectivity = 1.0f;
        if (csPK != null && csFK != null) {
            if (csPK.isPrimaryKey()) {
                // Use Max-Min Range as NDV gets scaled by selectivity.
                if (csPK.getRange() != null && csFK.getRange() != null) {
                    long pkRangeDelta = getRangeDelta(csPK.getRange());
                    long fkRangeDelta = getRangeDelta(csFK.getRange());
                    if (fkRangeDelta > 0 && pkRangeDelta > 0 && fkRangeDelta < pkRangeDelta) {
                        scaledSelectivity = (float) pkRangeDelta / (float) fkRangeDelta;
                    }
                }
            }
        }
        return scaledSelectivity;
    }
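
    // Worked example (illustrative): a primary key spanning [1, 1000] has range
    // delta 999; a foreign key spanning [1, 100] has delta 99. Since
    // 0 < 99 < 999, the scaled selectivity is 999f / 99f, roughly 10.09, which
    // boosts estimates for the narrower foreign-key range.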

    private static long getRangeDelta(ColStatistics.Range range) {
        if (range.minValue != null && range.maxValue != null) {
            return (range.maxValue.longValue() - range.minValue.longValue());
        }
        return 0;
    }

    private static boolean isWithin(ColStatistics.Range range1, ColStatistics.Range range2) {
        if (range1.minValue != null && range2.minValue != null && range1.maxValue != null
                && range2.maxValue != null) {
            if (range1.minValue.longValue() >= range2.minValue.longValue()
                    && range1.maxValue.longValue() <= range2.maxValue.longValue()) {
                return true;
            }
        }
        return false;
    }

    private static void addParitionColumnStats(HiveConf conf, List<String> neededColumns,
            List<String> referencedColumns, List<ColumnInfo> schema, Table table, PrunedPartitionList partList,
            List<ColStatistics> colStats) throws HiveException {

        // Extra columns are the difference between referenced columns and needed
        // columns; the difference could be partition columns.
        List<String> extraCols = Lists.newArrayList(referencedColumns);
        if (referencedColumns.size() > neededColumns.size()) {
            extraCols.removeAll(neededColumns);
            for (String col : extraCols) {
                for (ColumnInfo ci : schema) {
                    // conditions for being partition column
                    if (col.equals(ci.getInternalName()) && ci.getIsVirtualCol() && !ci.isHiddenVirtualCol()) {
                        // currently metastore does not store column stats for
                        // partition column, so we calculate the NDV from pruned
                        // partition list
                        ColStatistics partCS = new ColStatistics(ci.getInternalName(), ci.getType().getTypeName());
                        long numPartitions = getNDVPartitionColumn(partList.getPartitions(), ci.getInternalName());
                        partCS.setCountDistint(numPartitions);
                        partCS.setAvgColLen(StatsUtils.getAvgColLenOfVariableLengthTypes(conf,
                                ci.getObjectInspector(), partCS.getColumnType()));
                        partCS.setRange(getRangePartitionColumn(partList.getPartitions(), ci.getInternalName(),
                                ci.getType().getTypeName(), conf.getVar(ConfVars.DEFAULTPARTITIONNAME)));
                        colStats.add(partCS);
                    }
                }
            }
        }
    }

    public static int getNDVPartitionColumn(Set<Partition> partitions, String partColName) {
        Set<String> distinctVals = new HashSet<String>(partitions.size());
        for (Partition partition : partitions) {
            distinctVals.add(partition.getSpec().get(partColName));
        }
        return distinctVals.size();
    }
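
    // Example (illustrative): for pruned partitions with specs {ds=2014-01-01},
    // {ds=2014-01-02} and {ds=2014-01-02}, the NDV of partition column "ds" is 2,
    // i.e. the number of distinct values of that key across the partition specs.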

    private static Range getRangePartitionColumn(Set<Partition> partitions, String partColName, String colType,
            String defaultPartName) {
        Range range = null;
        String partVal;
        if (colType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)
                || colType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)
                || colType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)
                || colType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME)) {
            long min = Long.MAX_VALUE;
            long max = Long.MIN_VALUE;
            for (Partition partition : partitions) {
                partVal = partition.getSpec().get(partColName);
                if (partVal.equals(defaultPartName)) {
                    // partition column value is null.
                    continue;
                } else {
                    long value = Long.parseLong(partVal);
                    min = Math.min(min, value);
                    max = Math.max(max, value);
                }
            }
            range = new Range(min, max);
        } else if (colType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)
                || colType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)) {
            double min = Double.MAX_VALUE;
            double max = -Double.MAX_VALUE; // Double.MIN_VALUE is the smallest positive double, not the most negative
            for (Partition partition : partitions) {
                partVal = partition.getSpec().get(partColName);
                if (partVal.equals(defaultPartName)) {
                    // partition column value is null.
                    continue;
                } else {
                    double value = Double.parseDouble(partVal);
                    min = Math.min(min, value);
                    max = Math.max(max, value);
                }
            }
            range = new Range(min, max);
        } else if (colType.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
            double min = Double.MAX_VALUE;
            double max = -Double.MAX_VALUE; // Double.MIN_VALUE is the smallest positive double, not the most negative
            for (Partition partition : partitions) {
                partVal = partition.getSpec().get(partColName);
                if (partVal.equals(defaultPartName)) {
                    // partition column value is null.
                    continue;
                } else {
                    double value = new BigDecimal(partVal).doubleValue();
                    min = Math.min(min, value);
                    max = Math.max(max, value);
                }
            }
            range = new Range(min, max);
        } else {
            // Column statistics for complex datatypes are not supported yet
            return null;
        }
        return range;
    }

    private static void setUnknownRcDsToAverage(List<Long> rowCounts, List<Long> dataSizes, int avgRowSize) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Estimated average row size: " + avgRowSize);
        }
        for (int i = 0; i < rowCounts.size(); i++) {
            long rc = rowCounts.get(i);
            long s = dataSizes.get(i);
            if (rc <= 0 && s > 0) {
                rc = s / avgRowSize;
                rowCounts.set(i, rc);
            }

            if (s <= 0 && rc > 0) {
                s = safeMult(rc, avgRowSize);
                dataSizes.set(i, s);
            }
        }
    }
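
    // Illustrative sketch (hypothetical demo method): with an average row size
    // of 8 bytes, an unknown row count (-1) with a known 800-byte data size is
    // backfilled to 100 rows, and a known 50-row partition with unknown size is
    // backfilled to 400 bytes.
    @SuppressWarnings("unused")
    private static void demoSetUnknownRcDsToAverage() {
        List<Long> rowCounts = Lists.newArrayList(-1L, 50L);
        List<Long> dataSizes = Lists.newArrayList(800L, -1L);
        setUnknownRcDsToAverage(rowCounts, dataSizes, 8);
        // rowCounts -> [100, 50], dataSizes -> [800, 400]
    }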

    public static int estimateRowSizeFromSchema(HiveConf conf, List<ColumnInfo> schema,
            List<String> neededColumns) {
        int avgRowSize = 0;
        for (String neededCol : neededColumns) {
            ColumnInfo ci = getColumnInfoForColumn(neededCol, schema);
            if (ci == null) {
                // No need to collect statistics of index columns
                continue;
            }
            ObjectInspector oi = ci.getObjectInspector();
            String colType = ci.getTypeName();
            if (colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)
                    || colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)
                    || colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
                    || colType.startsWith(serdeConstants.CHAR_TYPE_NAME)
                    || colType.startsWith(serdeConstants.LIST_TYPE_NAME)
                    || colType.startsWith(serdeConstants.MAP_TYPE_NAME)
                    || colType.startsWith(serdeConstants.STRUCT_TYPE_NAME)
                    || colType.startsWith(serdeConstants.UNION_TYPE_NAME)) {
                avgRowSize += getAvgColLenOfVariableLengthTypes(conf, oi, colType);
            } else {
                avgRowSize += getAvgColLenOfFixedLengthTypes(colType);
            }
        }
        return avgRowSize;
    }

    private static ColumnInfo getColumnInfoForColumn(String neededCol, List<ColumnInfo> schema) {
        for (ColumnInfo ci : schema) {
            if (ci.getInternalName().equalsIgnoreCase(neededCol)) {
                return ci;
            }
        }
        return null;
    }

    /**
     * Find the bytes on disk occupied by a table
     * @param conf
     *          - hive conf
     * @param table
     *          - table
     * @return size on disk
     */
    public static long getFileSizeForTable(HiveConf conf, Table table) {
        Path path = table.getPath();
        long size = 0;
        try {
            FileSystem fs = path.getFileSystem(conf);
            size = fs.getContentSummary(path).getLength();
        } catch (Exception e) {
            size = 0;
        }
        return size;
    }

    /**
     * Find the bytes on disk occupied by a list of partitions
     * @param conf
     *          - hive conf
     * @param parts
     *          - partition list
     * @return sizes of partitions
     */
    public static List<Long> getFileSizeForPartitions(HiveConf conf, List<Partition> parts) {
        List<Long> sizes = Lists.newArrayList();
        for (Partition part : parts) {
            Path path = part.getDataLocation();
            long size = 0;
            try {
                FileSystem fs = path.getFileSystem(conf);
                size = fs.getContentSummary(path).getLength();
            } catch (Exception e) {
                size = 0;
            }
            sizes.add(size);
        }
        return sizes;
    }

    private static boolean containsNonPositives(List<Long> vals) {
        for (Long val : vals) {
            if (val <= 0L) {
                return true;
            }
        }
        return false;
    }

    /**
     * Get sum of all values in the list that are >0
     * @param vals
     *          - list of values
     * @return sum
     */
    public static long getSumIgnoreNegatives(List<Long> vals) {
        long result = 0;
        for (Long l : vals) {
            if (l > 0) {
                result = safeAdd(result, l);
            }
        }
        return result;
    }
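
    // Illustrative sketch (hypothetical demo method): the metastore's -1
    // "unknown" sentinels and zero entries are skipped, so only known-positive
    // values contribute to the sum.
    @SuppressWarnings("unused")
    private static void demoGetSumIgnoreNegatives() {
        List<Long> partRowCounts = Lists.newArrayList(100L, -1L, 250L, 0L);
        long total = getSumIgnoreNegatives(partRowCounts); // 350
    }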

    private static Statistics.State deriveStatType(List<ColStatistics> colStats, List<String> neededColumns) {
        boolean hasStats = false, hasNull = (colStats == null) || (colStats.size() < neededColumns.size());
        if (colStats != null) {
            for (ColStatistics cs : colStats) {
                boolean isNull = cs == null;
                hasStats |= !isNull;
                hasNull |= isNull;
                if (hasNull && hasStats)
                    break;
            }
        }
        State result = (hasStats ? (hasNull ? Statistics.State.PARTIAL : Statistics.State.COMPLETE)
                : (neededColumns.isEmpty() ? Statistics.State.COMPLETE : Statistics.State.NONE));
        return result;
    }
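
    // Illustrative sketch (hypothetical demo method): stats exist for only two
    // of three needed columns, so the derived state is PARTIAL; stats for all
    // columns would yield COMPLETE, and none would yield NONE.
    @SuppressWarnings("unused")
    private static void demoDeriveStatType() {
        List<ColStatistics> colStats = Lists.newArrayList(
                new ColStatistics("a", serdeConstants.INT_TYPE_NAME),
                new ColStatistics("b", serdeConstants.STRING_TYPE_NAME));
        State state = deriveStatType(colStats, Lists.newArrayList("a", "b", "c"));
        // state == State.PARTIAL, since colStats.size() < neededColumns.size()
    }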

    /**
     * Convert ColumnStatisticsObj to ColStatistics
     * @param cso
     *          - ColumnStatisticsObj
     * @param tabName
     *          - table name
     * @param colName
     *          - column name
     * @return ColStatistics
     */
    public static ColStatistics getColStatistics(ColumnStatisticsObj cso, String tabName, String colName) {
        ColStatistics cs = new ColStatistics(colName, cso.getColType());
        String colType = cso.getColType();
        ColumnStatisticsData csd = cso.getStatsData();
        if (colType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)
                || colType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)
                || colType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)) {
            cs.setCountDistint(csd.getLongStats().getNumDVs());
            cs.setNumNulls(csd.getLongStats().getNumNulls());
            cs.setAvgColLen(JavaDataModel.get().primitive1());
            cs.setRange(csd.getLongStats().getLowValue(), csd.getLongStats().getHighValue());
        } else if (colType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME)) {
            cs.setCountDistint(csd.getLongStats().getNumDVs());
            cs.setNumNulls(csd.getLongStats().getNumNulls());
            cs.setAvgColLen(JavaDataModel.get().primitive2());
            cs.setRange(csd.getLongStats().getLowValue(), csd.getLongStats().getHighValue());
        } else if (colType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)) {
            cs.setCountDistint(csd.getDoubleStats().getNumDVs());
            cs.setNumNulls(csd.getDoubleStats().getNumNulls());
            cs.setAvgColLen(JavaDataModel.get().primitive1());
            cs.setRange(csd.getDoubleStats().getLowValue(), csd.getDoubleStats().getHighValue());
        } else if (colType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)) {
            cs.setCountDistint(csd.getDoubleStats().getNumDVs());
            cs.setNumNulls(csd.getDoubleStats().getNumNulls());
            cs.setAvgColLen(JavaDataModel.get().primitive2());
            cs.setRange(csd.getDoubleStats().getLowValue(), csd.getDoubleStats().getHighValue());
        } else if (colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)
                || colType.startsWith(serdeConstants.CHAR_TYPE_NAME)
                || colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)) {
            cs.setCountDistint(csd.getStringStats().getNumDVs());
            cs.setNumNulls(csd.getStringStats().getNumNulls());
            cs.setAvgColLen(csd.getStringStats().getAvgColLen());
        } else if (colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) {
            if (csd.getBooleanStats().getNumFalses() > 0 && csd.getBooleanStats().getNumTrues() > 0) {
                cs.setCountDistint(2);
            } else {
                cs.setCountDistint(1);
            }
            cs.setNumTrues(csd.getBooleanStats().getNumTrues());
            cs.setNumFalses(csd.getBooleanStats().getNumFalses());
            cs.setNumNulls(csd.getBooleanStats().getNumNulls());
            cs.setAvgColLen(JavaDataModel.get().primitive1());
        } else if (colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {
            cs.setAvgColLen(csd.getBinaryStats().getAvgColLen());
            cs.setNumNulls(csd.getBinaryStats().getNumNulls());
        } else if (colType.equalsIgnoreCase(serdeConstants.TIMESTAMP_TYPE_NAME)) {
            cs.setAvgColLen(JavaDataModel.get().lengthOfTimestamp());
        } else if (colType.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
            cs.setAvgColLen(JavaDataModel.get().lengthOfDecimal());
            cs.setCountDistint(csd.getDecimalStats().getNumDVs());
            cs.setNumNulls(csd.getDecimalStats().getNumNulls());
            Decimal val = csd.getDecimalStats().getHighValue();
            BigDecimal maxVal = HiveDecimal.create(new BigInteger(val.getUnscaled()), val.getScale())
                    .bigDecimalValue();
            val = csd.getDecimalStats().getLowValue();
            BigDecimal minVal = HiveDecimal.create(new BigInteger(val.getUnscaled()), val.getScale())
                    .bigDecimalValue();
            cs.setRange(minVal, maxVal);
        } else if (colType.equalsIgnoreCase(serdeConstants.DATE_TYPE_NAME)) {
            cs.setAvgColLen(JavaDataModel.get().lengthOfDate());
        } else {
            // Column statistics for complex datatypes are not supported yet
            return null;
        }

        return cs;
    }

    /**
     * Get table level column statistics from metastore for needed columns
     * @param table
     *          - table
     * @param schema
     *          - output schema
     * @param neededColumns
     *          - list of needed columns
     * @return column statistics
     */
    public static List<ColStatistics> getTableColumnStats(Table table, List<ColumnInfo> schema,
            List<String> neededColumns) {
        String dbName = table.getDbName();
        String tabName = table.getTableName();
        List<String> neededColsInTable = processNeededColumns(schema, neededColumns);
        List<ColStatistics> stats = null;
        try {
            List<ColumnStatisticsObj> colStat = Hive.get().getTableColumnStatistics(dbName, tabName,
                    neededColsInTable);
            stats = convertColStats(colStat, tabName);
        } catch (HiveException e) {
            LOG.error("Failed to retrieve table statistics: ", e);
            stats = null;
        }
        return stats;
    }

    private static List<ColStatistics> convertColStats(List<ColumnStatisticsObj> colStats, String tabName) {
        List<ColStatistics> stats = new ArrayList<ColStatistics>(colStats.size());
        for (ColumnStatisticsObj statObj : colStats) {
            ColStatistics cs = getColStatistics(statObj, tabName, statObj.getColName());
            stats.add(cs);
        }
        return stats;
    }

    private static List<String> processNeededColumns(List<ColumnInfo> schema, List<String> neededColumns) {
        // Remove hidden virtual columns, as well as needed columns that are not
        // part of the table. TODO: the latter case should not really happen...
        List<String> neededColsInTable = null;
        for (String col : neededColumns) {
            ColumnInfo ci = getColumnInfoForColumn(col, schema);
            // keep columns that exist in the schema and are not hidden virtual columns
            if (ci != null && !(ci.getIsVirtualCol() && ci.isHiddenVirtualCol())) {
                continue;
            }
            if (neededColsInTable == null) {
                neededColsInTable = Lists.newArrayList(neededColumns);
            }
            neededColsInTable.remove(col);
        }
        return (neededColsInTable == null || neededColsInTable.size() == 0) ? neededColumns : neededColsInTable;
    }
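
    // Note (illustrative): hidden virtual columns such as
    // BLOCK__OFFSET__INSIDE__FILE are filtered out here, so only real table
    // columns are passed to the metastore column-statistics calls.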

    /**
     * Get the raw data size of variable length data types
     * @param conf
     *          - hive conf
     * @param oi
     *          - object inspector
     * @param colType
     *          - column type
     * @return raw data size
     */
    public static long getAvgColLenOfVariableLengthTypes(HiveConf conf, ObjectInspector oi, String colType) {

        long configVarLen = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_MAX_VARIABLE_LENGTH);

        if (colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)) {

            // constant string projection Ex: select "hello" from table
            if (oi instanceof ConstantObjectInspector) {
                ConstantObjectInspector coi = (ConstantObjectInspector) oi;

                // if writable constant is null then return size 0
                if (coi.getWritableConstantValue() == null) {
                    return 0;
                }

                return coi.getWritableConstantValue().toString().length();
            } else if (oi instanceof WritableConstantStringObjectInspector) {

                // some UDFs return writable constant strings (fixed width)
                // Ex: select upper("hello") from table
                WritableConstantStringObjectInspector wcsoi = (WritableConstantStringObjectInspector) oi;

                return wcsoi.getWritableConstantValue().toString().length();
            } else if (oi instanceof WritableStringObjectInspector) {

                // some UDFs may emit strings of variable length. like pattern matching
                // UDFs. it's hard to find the length of such UDFs.
                // return the variable length from config
                return configVarLen;
            }
        } else if (colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)) {

            // constant varchar projection
            if (oi instanceof ConstantObjectInspector) {
                ConstantObjectInspector coi = (ConstantObjectInspector) oi;

                // if writable constant is null then return size 0
                if (coi.getWritableConstantValue() == null) {
                    return 0;
                }

                return coi.getWritableConstantValue().toString().length();
            } else if (oi instanceof WritableConstantHiveVarcharObjectInspector) {

                WritableConstantHiveVarcharObjectInspector wcsoi = (WritableConstantHiveVarcharObjectInspector) oi;
                return wcsoi.getWritableConstantValue().toString().length();
            } else if (oi instanceof WritableHiveVarcharObjectInspector) {
                return ((WritableHiveVarcharObjectInspector) oi).getMaxLength();
            }
        } else if (colType.startsWith(serdeConstants.CHAR_TYPE_NAME)) {

            // constant char projection
            if (oi instanceof ConstantObjectInspector) {
                ConstantObjectInspector coi = (ConstantObjectInspector) oi;

                // if writable constant is null then return size 0
                if (coi.getWritableConstantValue() == null) {
                    return 0;
                }

                return coi.getWritableConstantValue().toString().length();
            } else if (oi instanceof WritableConstantHiveCharObjectInspector) {

                WritableConstantHiveCharObjectInspector wcsoi = (WritableConstantHiveCharObjectInspector) oi;
                return wcsoi.getWritableConstantValue().toString().length();
            } else if (oi instanceof WritableHiveCharObjectInspector) {
                return ((WritableHiveCharObjectInspector) oi).getMaxLength();
            }
        } else if (colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {

            // constant byte arrays
            if (oi instanceof ConstantObjectInspector) {
                ConstantObjectInspector coi = (ConstantObjectInspector) oi;

                // if writable constant is null then return size 0
                if (coi.getWritableConstantValue() == null) {
                    return 0;
                }

                BytesWritable bw = ((BytesWritable) coi.getWritableConstantValue());
                return bw.getLength();
            } else if (oi instanceof WritableConstantBinaryObjectInspector) {

                // writable constant byte arrays
                WritableConstantBinaryObjectInspector wcboi = (WritableConstantBinaryObjectInspector) oi;

                return wcboi.getWritableConstantValue().getLength();
            } else if (oi instanceof WritableBinaryObjectInspector) {

                // return the variable length from config
                return configVarLen;
            }
        } else {

            // complex types (map, list, struct, union)
            return getSizeOfComplexTypes(conf, oi);
        }

        return 0;
    }

    /**
     * Get the size of complex data types
     * @param conf
     *          - hive conf
     * @param oi
     *          - object inspector
     * @return raw data size
     */
    public static long getSizeOfComplexTypes(HiveConf conf, ObjectInspector oi) {
        long result = 0;
        int length = 0;
        int listEntries = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_LIST_NUM_ENTRIES);
        int mapEntries = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_MAP_NUM_ENTRIES);

        switch (oi.getCategory()) {
        case PRIMITIVE:
            String colType = oi.getTypeName();
            if (colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)
                    || colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
                    || colType.startsWith(serdeConstants.CHAR_TYPE_NAME)) {
                int avgColLen = (int) getAvgColLenOfVariableLengthTypes(conf, oi, colType);
                result += JavaDataModel.get().lengthForStringOfLength(avgColLen);
            } else if (colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {
                int avgColLen = (int) getAvgColLenOfVariableLengthTypes(conf, oi, colType);
                result += JavaDataModel.get().lengthForByteArrayOfSize(avgColLen);
            } else {
                result += getAvgColLenOfFixedLengthTypes(colType);
            }
            break;
        case LIST:
            if (oi instanceof StandardConstantListObjectInspector) {

                // constant list projection of known length
                StandardConstantListObjectInspector scloi = (StandardConstantListObjectInspector) oi;
                length = scloi.getWritableConstantValue().size();

                // check if list elements are primitive or Objects
                ObjectInspector leoi = scloi.getListElementObjectInspector();
                if (leoi.getCategory().equals(ObjectInspector.Category.PRIMITIVE)) {
                    result += getSizeOfPrimitiveTypeArraysFromType(leoi.getTypeName(), length);
                } else {
                    result += JavaDataModel.get().lengthForObjectArrayOfSize(length);
                }
            } else {
                StandardListObjectInspector sloi = (StandardListObjectInspector) oi;

                // list overhead + (configured number of element in list * size of element)
                long elemSize = getSizeOfComplexTypes(conf, sloi.getListElementObjectInspector());
                result += JavaDataModel.get().arrayList() + (listEntries * elemSize);
            }
            break;
        case MAP:
            if (oi instanceof StandardConstantMapObjectInspector) {

                // constant map projection of known length
                StandardConstantMapObjectInspector scmoi = (StandardConstantMapObjectInspector) oi;
                result += getSizeOfMap(scmoi);
            } else {
                StandardMapObjectInspector smoi = (StandardMapObjectInspector) oi;
                result += getSizeOfComplexTypes(conf, smoi.getMapKeyObjectInspector());
                result += getSizeOfComplexTypes(conf, smoi.getMapValueObjectInspector());

                // hash map overhead
                result += JavaDataModel.get().hashMap(mapEntries);
            }
            break;
        case STRUCT:
            StructObjectInspector soi = (StructObjectInspector) oi;

            // add constant object overhead for struct
            result += JavaDataModel.get().object();

            // add constant struct field names references overhead
            result += soi.getAllStructFieldRefs().size() * JavaDataModel.get().ref();
            for (StructField field : soi.getAllStructFieldRefs()) {
                result += getSizeOfComplexTypes(conf, field.getFieldObjectInspector());
            }
            break;
        case UNION:
            UnionObjectInspector uoi = (UnionObjectInspector) oi;

            // add constant object overhead for union
            result += JavaDataModel.get().object();

            // add constant size for the union's tags
            result += uoi.getObjectInspectors().size() * JavaDataModel.get().primitive1();
            for (ObjectInspector foi : uoi.getObjectInspectors()) {
                result += getSizeOfComplexTypes(conf, foi);
            }
            break;
        default:
            break;
        }

        return result;
    }
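
    // Worked example (illustrative): a non-constant list<int> is estimated as
    // JavaDataModel.get().arrayList() + HIVE_STATS_LIST_NUM_ENTRIES *
    // primitive1(), i.e. list overhead plus the configured number of primitive
    // elements.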

    /**
     * Get size of fixed length primitives
     * @param colType
     *          - column type
     * @return raw data size
     */
    public static long getAvgColLenOfFixedLengthTypes(String colType) {
        if (colType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)
                || colType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)
                || colType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)
                || colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)
                || colType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)) {
            return JavaDataModel.get().primitive1();
        } else if (colType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)
                || colType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME) || colType.equalsIgnoreCase("long")) {
            return JavaDataModel.get().primitive2();
        } else if (colType.equalsIgnoreCase(serdeConstants.TIMESTAMP_TYPE_NAME)) {
            return JavaDataModel.get().lengthOfTimestamp();
        } else if (colType.equalsIgnoreCase(serdeConstants.DATE_TYPE_NAME)) {
            return JavaDataModel.get().lengthOfDate();
        } else if (colType.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
            return JavaDataModel.get().lengthOfDecimal();
        } else {
            return 0;
        }
    }
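
    // Illustrative sketch (hypothetical demo method): fixed-length sizing
    // follows JavaDataModel, e.g. int and float use primitive1() bytes while
    // bigint and double use primitive2().
    @SuppressWarnings("unused")
    private static void demoFixedLengthSizes() {
        long intLen = getAvgColLenOfFixedLengthTypes(serdeConstants.INT_TYPE_NAME);
        long bigintLen = getAvgColLenOfFixedLengthTypes(serdeConstants.BIGINT_TYPE_NAME);
        // intLen == JavaDataModel.get().primitive1(); bigintLen == primitive2()
    }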

    /**
     * Get the size of arrays of primitive types
     * @param colType
     *          - column type
     * @param length
     *          - array length
     * @return raw data size
     */
    public static long getSizeOfPrimitiveTypeArraysFromType(String colType, int length) {
        if (colType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)
                || colType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)
                || colType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)
                || colType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)) {
            return JavaDataModel.get().lengthForIntArrayOfSize(length);
        } else if (colType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)) {
            return JavaDataModel.get().lengthForDoubleArrayOfSize(length);
        } else if (colType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME) || colType.equalsIgnoreCase("long")) {
            return JavaDataModel.get().lengthForLongArrayOfSize(length);
        } else if (colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {
            return JavaDataModel.get().lengthForByteArrayOfSize(length);
        } else if (colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) {
            return JavaDataModel.get().lengthForBooleanArrayOfSize(length);
        } else if (colType.equalsIgnoreCase(serdeConstants.TIMESTAMP_TYPE_NAME)) {
            return JavaDataModel.get().lengthForTimestampArrayOfSize(length);
        } else if (colType.equalsIgnoreCase(serdeConstants.DATE_TYPE_NAME)) {
            return JavaDataModel.get().lengthForDateArrayOfSize(length);
        } else if (colType.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
            return JavaDataModel.get().lengthForDecimalArrayOfSize(length);
        } else {
            return 0;
        }
    }

    /**
     * Estimate the size of map object
     * @param scmoi
     *          - object inspector
     * @return size of map
     */
    public static long getSizeOfMap(StandardConstantMapObjectInspector scmoi) {
        Map<?, ?> map = scmoi.getWritableConstantValue();
        ObjectInspector koi = scmoi.getMapKeyObjectInspector();
        ObjectInspector voi = scmoi.getMapValueObjectInspector();
        long result = 0;
        for (Map.Entry<?, ?> entry : map.entrySet()) {
            result += getWritableSize(koi, entry.getKey());
            result += getWritableSize(voi, entry.getValue());
        }

        // add the additional overhead of each map entry
        result += JavaDataModel.get().hashMap(map.entrySet().size());
        return result;
    }

    /**
     * Get size of primitive data types based on their respective writable object inspector
     * @param oi
     *          - object inspector
     * @param value
     *          - value
     * @return raw data size
     */
    public static long getWritableSize(ObjectInspector oi, Object value) {
        if (oi instanceof WritableStringObjectInspector) {
            WritableStringObjectInspector woi = (WritableStringObjectInspector) oi;
            return JavaDataModel.get().lengthForStringOfLength(woi.getPrimitiveWritableObject(value).getLength());
        } else if (oi instanceof WritableBinaryObjectInspector) {
            WritableBinaryObjectInspector woi = (WritableBinaryObjectInspector) oi;
            return JavaDataModel.get().lengthForByteArrayOfSize(woi.getPrimitiveWritableObject(value).getLength());
        } else if (oi instanceof WritableBooleanObjectInspector) {
            return JavaDataModel.get().primitive1();
        } else if (oi instanceof WritableByteObjectInspector) {
            return JavaDataModel.get().primitive1();
        } else if (oi instanceof WritableDateObjectInspector) {
            return JavaDataModel.get().lengthOfDate();
        } else if (oi instanceof WritableDoubleObjectInspector) {
            return JavaDataModel.get().primitive2();
        } else if (oi instanceof WritableFloatObjectInspector) {
            return JavaDataModel.get().primitive1();
        } else if (oi instanceof WritableHiveDecimalObjectInspector) {
            return JavaDataModel.get().lengthOfDecimal();
        } else if (oi instanceof WritableIntObjectInspector) {
            return JavaDataModel.get().primitive1();
        } else if (oi instanceof WritableLongObjectInspector) {
            return JavaDataModel.get().primitive2();
        } else if (oi instanceof WritableShortObjectInspector) {
            return JavaDataModel.get().primitive1();
        } else if (oi instanceof WritableTimestampObjectInspector) {
            return JavaDataModel.get().lengthOfTimestamp();
        }

        return 0;
    }
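
    // Example (illustrative): a Text value "hello" under a
    // WritableStringObjectInspector is charged
    // JavaDataModel.get().lengthForStringOfLength(5), which accounts for Java
    // object and char[] overhead rather than just the five raw bytes.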

    /**
     * Get column statistics from parent statistics.
     * @param conf
     *          - hive conf
     * @param parentStats
     *          - parent statistics
     * @param colExprMap
     *          - column expression map
     * @param rowSchema
     *          - row schema
     * @return column statistics
     */
    public static List<ColStatistics> getColStatisticsFromExprMap(HiveConf conf, Statistics parentStats,
            Map<String, ExprNodeDesc> colExprMap, RowSchema rowSchema) {

        List<ColStatistics> cs = Lists.newArrayList();
        if (colExprMap != null && rowSchema != null) {
            for (ColumnInfo ci : rowSchema.getSignature()) {
                String outColName = ci.getInternalName();
                ExprNodeDesc end = colExprMap.get(outColName);
                ColStatistics colStat = getColStatisticsFromExpression(conf, parentStats, end);
                if (colStat != null) {
                    colStat.setColumnName(outColName);
                    cs.add(colStat);
                }
            }
            // sometimes RowSchema is empty, so fetch stats of columns in exprMap
            for (Entry<String, ExprNodeDesc> pair : colExprMap.entrySet()) {
                if (rowSchema.getColumnInfo(pair.getKey()) == null) {
                    ColStatistics colStat = getColStatisticsFromExpression(conf, parentStats, pair.getValue());
                    if (colStat != null) {
                        colStat.setColumnName(pair.getKey());
                        cs.add(colStat);
                    }
                }
            }

            return cs;
        }

        // In cases where column expression map or row schema is missing, just pass on the parent column
        // stats. This could happen in cases like TS -> FIL where FIL does not map input column names to
        // internal names.
        if (colExprMap == null || rowSchema == null) {
            if (parentStats.getColumnStats() != null) {
                cs.addAll(parentStats.getColumnStats());
            }
        }
        return cs;
    }

    /**
     * Get column statistics from parent statistics given the
     * row schema of its child.
     * @param parentStats
     *          - parent statistics
     * @param rowSchema
     *          - row schema
     * @return column statistics
     */
    public static List<ColStatistics> getColStatisticsUpdatingTableAlias(Statistics parentStats,
            RowSchema rowSchema) {

        List<ColStatistics> cs = Lists.newArrayList();

        for (ColStatistics parentColStat : parentStats.getColumnStats()) {
            ColStatistics colStat;
            try {
                colStat = parentColStat.clone();
            } catch (CloneNotSupportedException e) {
                colStat = null;
            }
            if (colStat != null) {
                cs.add(colStat);
            }
        }

        return cs;
    }

    /**
     * Get column statistics expression nodes
     * @param conf
     *          - hive conf
     * @param parentStats
     *          - parent statistics
     * @param end
     *          - expression nodes
     * @return column statistics
     */
    public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statistics parentStats,
            ExprNodeDesc end) {

        if (end == null) {
            return null;
        }

        String colName = null;
        String colType = null;
        double avgColSize = 0;
        long countDistincts = 0;
        long numNulls = 0;
        ObjectInspector oi = null;
        long numRows = parentStats.getNumRows();

        if (end instanceof ExprNodeColumnDesc) {
            // column projection
            ExprNodeColumnDesc encd = (ExprNodeColumnDesc) end;
            colName = encd.getColumn();

            if (encd.getIsPartitionColOrVirtualCol()) {

                ColStatistics colStats = parentStats.getColumnStatisticsFromColName(colName);
                if (colStats != null) {
                    /* If statistics for the column already exist use it. */
                    return colStats;
                }

                // virtual columns
                colType = encd.getTypeInfo().getTypeName();
                countDistincts = numRows;
                oi = encd.getWritableObjectInspector();
            } else {

                // clone the column stats and return
                ColStatistics result = parentStats.getColumnStatisticsFromColName(colName);
                if (result != null) {
                    try {
                        return result.clone();
                    } catch (CloneNotSupportedException e) {
                        return null;
                    }
                }
                return null;
            }
        } else if (end instanceof ExprNodeConstantDesc) {

            // constant projection
            ExprNodeConstantDesc encd = (ExprNodeConstantDesc) end;

            // null projection
            if (encd.getValue() == null) {
                colName = encd.getName();
                colType = "null";
                numNulls = numRows;
            } else {
                colName = encd.getName();
                colType = encd.getTypeString();
                countDistincts = 1;
                oi = encd.getWritableObjectInspector();
            }
        } else if (end instanceof ExprNodeGenericFuncDesc) {

            // udf projection
            ExprNodeGenericFuncDesc engfd = (ExprNodeGenericFuncDesc) end;
            colName = engfd.getName();
            colType = engfd.getTypeString();
            countDistincts = numRows;
            oi = engfd.getWritableObjectInspector();
        } else if (end instanceof ExprNodeColumnListDesc) {

            // column list
            ExprNodeColumnListDesc encd = (ExprNodeColumnListDesc) end;
            colName = Joiner.on(",").join(encd.getCols());
            colType = "array";
            countDistincts = numRows;
            oi = encd.getWritableObjectInspector();
        } else if (end instanceof ExprNodeFieldDesc) {

            // field within complex type
            ExprNodeFieldDesc enfd = (ExprNodeFieldDesc) end;
            colName = enfd.getFieldName();
            colType = enfd.getTypeString();
            countDistincts = numRows;
            oi = enfd.getWritableObjectInspector();
        }

        if (colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)
                || colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)
                || colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
                || colType.startsWith(serdeConstants.CHAR_TYPE_NAME)
                || colType.startsWith(serdeConstants.LIST_TYPE_NAME)
                || colType.startsWith(serdeConstants.MAP_TYPE_NAME)
                || colType.startsWith(serdeConstants.STRUCT_TYPE_NAME)
                || colType.startsWith(serdeConstants.UNION_TYPE_NAME)) {
            avgColSize = getAvgColLenOfVariableLengthTypes(conf, oi, colType);
        } else {
            avgColSize = getAvgColLenOfFixedLengthTypes(colType);
        }

        ColStatistics colStats = new ColStatistics(colName, colType);
        colStats.setAvgColLen(avgColSize);
        colStats.setCountDistint(countDistincts);
        colStats.setNumNulls(numNulls);

        return colStats;
    }
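    // Illustrative summary of the mapping above: a non-null constant projection
    // yields countDistincts = 1 and numNulls = 0; a NULL constant yields
    // numNulls = numRows; UDF, column-list and field projections default to
    // countDistincts = numRows in the absence of better information.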

    /**
     * Get the number of rows of a given table
     * @param table
     *          - table
     * @return number of rows
     */
    public static long getNumRows(Table table) {
        return getBasicStatForTable(table, StatsSetupConst.ROW_COUNT);
    }

    /**
     * Get the raw data size of a given table
     * @param table
     *          - table
     * @return raw data size
     */
    public static long getRawDataSize(Table table) {
        return getBasicStatForTable(table, StatsSetupConst.RAW_DATA_SIZE);
    }

    /**
     * Get the total size of a given table
     * @param table
     *          - table
     * @return total size
     */
    public static long getTotalSize(Table table) {
        return getBasicStatForTable(table, StatsSetupConst.TOTAL_SIZE);
    }

    /**
     * Get a basic stat of a table
     * @param table
     *          - table
     * @param statType
     *          - type of stat (e.g. StatsSetupConst.ROW_COUNT)
     * @return value of the stat, or 0 if it is missing or cannot be parsed
     */
    public static long getBasicStatForTable(Table table, String statType) {
        Map<String, String> params = table.getParameters();
        long result = 0;

        if (params != null) {
            try {
                result = Long.parseLong(params.get(statType));
            } catch (NumberFormatException e) {
                result = 0;
            }
        }
        return result;
    }
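    // Usage sketch (illustrative, assuming a Table handle "tbl" whose parameters
    // were populated by ANALYZE TABLE ... COMPUTE STATISTICS):
    //   long rows = StatsUtils.getBasicStatForTable(tbl, StatsSetupConst.ROW_COUNT);
    // A missing parameter yields 0, since Long.parseLong(null) throws the
    // NumberFormatException that is caught above.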

    /**
     * Get a basic stat for each of the given partitions
     * @param table
     *          - table
     * @param parts
     *          - partitions
     * @param statType
     *          - type of stat (e.g. StatsSetupConst.RAW_DATA_SIZE)
     * @return stat values, one entry per partition that has parameters
     */
    public static List<Long> getBasicStatForPartitions(Table table, List<Partition> parts, String statType) {

        List<Long> stats = Lists.newArrayList();
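        // Note: partitions whose parameter map is null contribute no entry, so
        // the returned list can be shorter than the input partition list.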
        for (Partition part : parts) {
            Map<String, String> params = part.getParameters();
            long result = 0;
            if (params != null) {
                try {
                    result = Long.parseLong(params.get(statType));
                } catch (NumberFormatException e) {
                    result = 0;
                }
                stats.add(result);
            }
        }
        return stats;
    }

    /**
     * Compute raw data size from column statistics
     * @param numRows
     *          - number of rows
     * @param colStats
     *          - column statistics
     * @return raw data size
     */
    public static long getDataSizeFromColumnStats(long numRows, List<ColStatistics> colStats) {
        long result = 0;

        if (numRows <= 0) {
            return result;
        }
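        // Raw data size is the sum over all columns of
        // (numRows - numNulls) * average size of a single value of that column.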

        for (ColStatistics cs : colStats) {
            if (cs != null) {
                String colType = cs.getColumnType();
                long nonNullCount = numRows - cs.getNumNulls();
                double sizeOf = 0;
                if (colType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)
                        || colType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)
                        || colType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)
                        || colType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME)
                        || colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)
                        || colType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)
                        || colType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)) {
                    sizeOf = cs.getAvgColLen();
                } else if (colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)
                        || colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
                        || colType.startsWith(serdeConstants.CHAR_TYPE_NAME)) {
                    int acl = (int) Math.round(cs.getAvgColLen());
                    sizeOf = JavaDataModel.get().lengthForStringOfLength(acl);
                } else if (colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {
                    int acl = (int) Math.round(cs.getAvgColLen());
                    sizeOf = JavaDataModel.get().lengthForByteArrayOfSize(acl);
                } else if (colType.equalsIgnoreCase(serdeConstants.TIMESTAMP_TYPE_NAME)) {
                    sizeOf = JavaDataModel.get().lengthOfTimestamp();
                } else if (colType.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
                    sizeOf = JavaDataModel.get().lengthOfDecimal();
                } else if (colType.equalsIgnoreCase(serdeConstants.DATE_TYPE_NAME)) {
                    sizeOf = JavaDataModel.get().lengthOfDate();
                } else {
                    sizeOf = cs.getAvgColLen();
                }
                result = safeAdd(result, safeMult(nonNullCount, sizeOf));
            }
        }

        return result;
    }
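    // Worked example (illustrative): with numRows = 1000, a bigint column with
    // avgColLen 8 and 100 nulls contributes 900 * 8 = 7200, while a string
    // column with avgColLen 10 and no nulls contributes
    // 1000 * JavaDataModel.get().lengthForStringOfLength(10); per-column terms
    // are combined with the overflow-safe helpers defined below.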

    public static String getFullyQualifiedTableName(String dbName, String tabName) {
        return getFullyQualifiedName(dbName, tabName);
    }

    private static String getFullyQualifiedName(String... names) {
        List<String> nonNullAndEmptyNames = Lists.newArrayList();
        for (String name : names) {
            if (name != null && !name.isEmpty()) {
                nonNullAndEmptyNames.add(name);
            }
        }
        return Joiner.on(".").join(nonNullAndEmptyNames);
    }
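    // Example (illustrative): getFullyQualifiedTableName("default", "src") returns
    // "default.src"; null or empty parts are dropped, so a null database name
    // yields just "src".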

    /**
     * Get qualified column names from output key column names
     * @param keyExprs
     *          - output key names
     * @return list of qualified names
     */
    public static List<String> getQualifedReducerKeyNames(List<String> keyExprs) {
        List<String> result = Lists.newArrayList();
        if (keyExprs != null) {
            for (String key : keyExprs) {
                result.add(Utilities.ReduceField.KEY.toString() + "." + key);
            }
        }
        return result;
    }
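    // Example (illustrative): since Utilities.ReduceField.KEY.toString() is "KEY",
    // the input ["_col0", "_col1"] maps to ["KEY._col0", "KEY._col1"].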

    public static long getAvailableMemory(Configuration conf) {
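        // Resolution order (the value is in MB): the Tez container size if
        // configured, else mapreduce.map.memory.mb, else a 1024 MB fallback.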
        int memory = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVETEZCONTAINERSIZE);
        if (memory <= 0) {
            memory = conf.getInt(MRJobConfig.MAP_MEMORY_MB, MRJobConfig.DEFAULT_MAP_MEMORY_MB);
            if (memory <= 0) {
                memory = 1024;
            }
        }
        return memory;
    }

    /**
     * A negative number of rows or data size is invalid and may be the result of
     * long overflow, in which case Long.MAX_VALUE is returned
     * @param val - input value
     * @return Long.MAX_VALUE if val is negative, else val
     */
    public static long getMaxIfOverflow(long val) {
        return val < 0 ? Long.MAX_VALUE : val;
    }

    /** Bounded multiplication of a long by a double - results above Long.MAX_VALUE become Long.MAX_VALUE */
    public static long safeMult(long a, double b) {
        double result = a * b;
        return (result > Long.MAX_VALUE) ? Long.MAX_VALUE : (long) result;
    }

    /** Bounded addition - overflows become MAX_VALUE */
    public static long safeAdd(long a, long b) {
        try {
            return LongMath.checkedAdd(a, b);
        } catch (ArithmeticException ex) {
            return Long.MAX_VALUE;
        }
    }

    /** Bounded multiplication - overflows become MAX_VALUE */
    public static long safeMult(long a, long b) {
        try {
            return LongMath.checkedMultiply(a, b);
        } catch (ArithmeticException ex) {
            return Long.MAX_VALUE;
        }
    }
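    // Saturation behavior of the helpers above (illustrative):
    //   safeAdd(Long.MAX_VALUE, 1L)   -> Long.MAX_VALUE
    //   safeMult(Long.MAX_VALUE, 2L)  -> Long.MAX_VALUE
    //   safeMult(1000L, 1.5)          -> 1500
    //   getMaxIfOverflow(-1L)         -> Long.MAX_VALUE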
}