Java tutorial
/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package io.prestosql.plugin.hive.statistics; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.VerifyException; import com.google.common.collect.ImmutableMap; import com.google.common.hash.HashFunction; import com.google.common.primitives.Ints; import com.google.common.primitives.Shorts; import com.google.common.primitives.SignedBytes; import io.airlift.log.Logger; import io.airlift.slice.Slice; import io.prestosql.plugin.hive.HiveBasicStatistics; import io.prestosql.plugin.hive.HiveColumnHandle; import io.prestosql.plugin.hive.HivePartition; import io.prestosql.plugin.hive.PartitionStatistics; import io.prestosql.plugin.hive.metastore.DateStatistics; import io.prestosql.plugin.hive.metastore.DecimalStatistics; import io.prestosql.plugin.hive.metastore.DoubleStatistics; import io.prestosql.plugin.hive.metastore.HiveColumnStatistics; import io.prestosql.plugin.hive.metastore.IntegerStatistics; import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore; import io.prestosql.spi.PrestoException; import io.prestosql.spi.connector.ColumnHandle; import io.prestosql.spi.connector.ConnectorSession; import io.prestosql.spi.connector.SchemaTableName; import io.prestosql.spi.predicate.NullableValue; import io.prestosql.spi.statistics.ColumnStatistics; import io.prestosql.spi.statistics.DoubleRange; import io.prestosql.spi.statistics.Estimate; import 
io.prestosql.spi.statistics.TableStatistics; import io.prestosql.spi.type.DecimalType; import io.prestosql.spi.type.Decimals; import io.prestosql.spi.type.Type; import java.math.BigDecimal; import java.time.LocalDate; import java.util.ArrayList; import java.util.Collection; import java.util.Comparator; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.OptionalDouble; import java.util.OptionalLong; import java.util.Set; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Verify.verify; import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.collect.ImmutableSet.toImmutableSet; import static com.google.common.collect.Maps.immutableEntry; import static com.google.common.hash.Hashing.murmur3_128; import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_CORRUPTED_COLUMN_STATISTICS; import static io.prestosql.plugin.hive.HivePartition.UNPARTITIONED_ID; import static io.prestosql.plugin.hive.HiveSessionProperties.getPartitionStatisticsSampleSize; import static io.prestosql.plugin.hive.HiveSessionProperties.isIgnoreCorruptedStatistics; import static io.prestosql.plugin.hive.HiveSessionProperties.isStatisticsEnabled; import static io.prestosql.spi.type.BigintType.BIGINT; import static io.prestosql.spi.type.Chars.isCharType; import static io.prestosql.spi.type.DateType.DATE; import static io.prestosql.spi.type.Decimals.isLongDecimal; import static io.prestosql.spi.type.Decimals.isShortDecimal; import static io.prestosql.spi.type.DoubleType.DOUBLE; import static io.prestosql.spi.type.IntegerType.INTEGER; import static io.prestosql.spi.type.RealType.REAL; import static io.prestosql.spi.type.SmallintType.SMALLINT; import static io.prestosql.spi.type.TinyintType.TINYINT; import static io.prestosql.spi.type.Varchars.isVarcharType; import static java.lang.Double.isFinite; import static java.lang.Double.isNaN; import 
static java.lang.Double.parseDouble;
import static java.lang.Float.intBitsToFloat;
import static java.lang.String.format;
import static java.util.Collections.unmodifiableList;
import static java.util.Objects.requireNonNull;

/**
 * {@link HiveStatisticsProvider} that builds table-level statistics out of per-partition
 * statistics obtained from a {@link PartitionsStatisticsProvider}.
 *
 * <p>Only a sample of the requested partitions is looked up (see
 * {@link #getPartitionsSample(List, int)}); the sampled figures are then extrapolated to the
 * full list of requested partitions.
 */
public class MetastoreHiveStatisticsProvider implements HiveStatisticsProvider {
    private static final Logger log = Logger.get(MetastoreHiveStatisticsProvider.class);

    private final PartitionsStatisticsProvider statisticsProvider;

    /**
     * Creates a provider that reads partition statistics from the given metastore.
     *
     * @param metastore metastore to query; must not be {@code null}
     * @throws NullPointerException if {@code metastore} is {@code null}
     */
    public MetastoreHiveStatisticsProvider(SemiTransactionalHiveMetastore metastore) {
        requireNonNull(metastore, "metastore is null");
        this.statisticsProvider = (table, hivePartitions) -> getPartitionsStatistics(metastore, table, hivePartitions);
    }

    /**
     * Creates a provider on top of an arbitrary statistics source.
     *
     * @param statisticsProvider source of per-partition statistics; must not be {@code null}
     */
    @VisibleForTesting
    MetastoreHiveStatisticsProvider(PartitionsStatisticsProvider statisticsProvider) {
        this.statisticsProvider = requireNonNull(statisticsProvider, "statisticsProvider is null");
    }

    /**
     * Fetches statistics for the given partitions from the metastore.
     *
     * <p>An empty partition list yields an empty map. If any partition carries
     * {@code UNPARTITIONED_ID}, the list must contain exactly that one entry and the table-level
     * statistics are returned under the {@code UNPARTITIONED_ID} key. Otherwise the statistics
     * are requested by partition id.
     *
     * @throws IllegalArgumentException if an unpartitioned entry is mixed with other partitions
     */
    private static Map<String, PartitionStatistics> getPartitionsStatistics(
            SemiTransactionalHiveMetastore metastore,
            SchemaTableName table,
            List<HivePartition> hivePartitions) {
        if (hivePartitions.isEmpty()) {
            return ImmutableMap.of();
        }
        boolean unpartitioned = hivePartitions.stream()
                .anyMatch(partition -> partition.getPartitionId().equals(UNPARTITIONED_ID));
        if (unpartitioned) {
            checkArgument(hivePartitions.size() == 1, "expected only one hive partition");
            return ImmutableMap.of(UNPARTITIONED_ID, metastore.getTableStatistics(table.getSchemaName(), table.getTableName()));
        }
        Set<String> partitionNames = hivePartitions.stream().map(HivePartition::getPartitionId)
                .collect(toImmutableSet());
        return metastore.getPartitionStatistics(table.getSchemaName(), table.getTableName(), partitionNames);
    }

    /**
     * Estimates table statistics for the given columns over the given partitions.
     *
     * <ul>
     * <li>Returns empty statistics when statistics are disabled for the session.</li>
     * <li>Returns all-zero statistics when {@code partitions} is empty.</li>
     * <li>Otherwise samples the partitions, fetches and validates their statistics, and
     * extrapolates them to all of {@code partitions}.</li>
     * </ul>
     *
     * @param session session supplying the statistics-related session properties
     * @param table table the partitions belong to
     * @param columns column handles keyed by column name
     * @param columnTypes column types keyed by column name
     * @param partitions partitions the estimate should cover
     * @return the estimated statistics; empty if the sampled statistics are corrupted and the
     *         session is configured to ignore corrupted statistics
     * @throws PrestoException if fetching or validating the statistics fails and the failure is
     *         not an ignorable corrupted-statistics error
     */
    @Override
    public TableStatistics getTableStatistics(
            ConnectorSession session,
            SchemaTableName table,
            Map<String, ColumnHandle> columns,
            Map<String, Type> columnTypes,
            List<HivePartition> partitions) {
        if (!isStatisticsEnabled(session)) {
            return TableStatistics.empty();
        }
        if (partitions.isEmpty()) {
            return createZeroStatistics(columns, columnTypes);
        }
        int sampleSize = getPartitionStatisticsSampleSize(session);
        List<HivePartition> partitionsSample = getPartitionsSample(partitions, sampleSize);
        try {
            Map<String, PartitionStatistics> statisticsSample = statisticsProvider.getPartitionsStatistics(table, partitionsSample);
            validatePartitionStatistics(table, statisticsSample);
            return getTableStatistics(columns, columnTypes, partitions, statisticsSample);
        } catch (PrestoException e) {
            // Corrupted statistics are downgraded to "no statistics" only when the session opts in;
            // every other failure is propagated unchanged.
            if (e.getErrorCode().equals(HIVE_CORRUPTED_COLUMN_STATISTICS.toErrorCode()) && isIgnoreCorruptedStatistics(session)) {
                log.error(e);
                return TableStatistics.empty();
            }
            throw e;
        }
    }

    /**
     * Builds statistics describing an empty result: row count 0 and, for every column, nulls
     * fraction 0 and distinct values count 0. Data size 0 is set only for types accepted by
     * {@link #hasDataSize(Type)}.
     *
     * @throws VerifyException if {@code columnTypes} has no entry for one of the columns
     */
    private TableStatistics createZeroStatistics(Map<String, ColumnHandle> columns, Map<String, Type> columnTypes) {
        TableStatistics.Builder result = TableStatistics.builder();
        result.setRowCount(Estimate.of(0));
        columns.forEach((columnName, columnHandle) -> {
            Type columnType = columnTypes.get(columnName);
            verify(columnType != null, "columnType is missing for column: %s", columnName);
            ColumnStatistics.Builder columnStatistics = ColumnStatistics.builder();
            columnStatistics.setNullsFraction(Estimate.of(0));
            columnStatistics.setDistinctValuesCount(Estimate.of(0));
            if (hasDataSize(columnType)) {
                columnStatistics.setDataSize(Estimate.of(0));
            }
            result.setColumnStatistics(columnHandle, columnStatistics.build());
        });
        return result.build();
    }

    /**
     * Picks at most {@code sampleSize} partitions to fetch statistics for.
     *
     * <p>If the list already fits into the sample it is returned as-is (the same instance).
     * Otherwise the partition with the smallest partition id is always taken, the one with the
     * largest id is taken if there is room, and any remaining slots are filled with the
     * partitions whose murmur3 hash of the partition id is smallest (ties broken by partition
     * id), which makes the selection deterministic for a given input.
     *
     * @param partitions candidate partitions
     * @param sampleSize maximum number of partitions to return; must be positive
     * @return the input list itself, or an unmodifiable list holding the sample
     * @throws IllegalArgumentException if {@code sampleSize} is not greater than zero
     */
    @VisibleForTesting
    static List<HivePartition> getPartitionsSample(List<HivePartition> partitions, int sampleSize) {
        checkArgument(sampleSize > 0, "sampleSize is expected to be greater than zero");
        if (partitions.size() <= sampleSize) {
            return partitions;
        }
        List<HivePartition> result = new ArrayList<>();
        int samplesLeft = sampleSize;
        HivePartition min = partitions.get(0);
        HivePartition max = partitions.get(0);
        for (HivePartition partition : partitions) {
            // "else if" is sufficient: min <= max always holds, so an id below min cannot also be above max.
            if (partition.getPartitionId().compareTo(min.getPartitionId()) < 0) {
                min = partition;
            } else if (partition.getPartitionId().compareTo(max.getPartitionId()) > 0) {
                max = partition;
            }
        }
        result.add(min);
        samplesLeft--;
        if (samplesLeft > 0) {
            result.add(max);
            samplesLeft--;
        }
        if (samplesLeft > 0) {
            HashFunction hashFunction = murmur3_128();
            Comparator<Map.Entry<HivePartition, Long>> hashComparator = Comparator.<Map.Entry<HivePartition, Long>, Long>comparing(
                    Map.Entry::getValue).thenComparing(entry -> entry.getKey().getPartitionId());
            partitions.stream().filter(partition -> !result.contains(partition))
                    .map(partition -> immutableEntry(partition, hashFunction.hashUnencodedChars(partition.getPartitionId()).asLong()))
                    .sorted(hashComparator).limit(samplesLeft).forEachOrdered(entry -> result.add(entry.getKey()));
        }
        return unmodifiableList(result);
    }

    /**
     * Checks basic and per-column statistics of every partition for internal consistency.
     *
     * <p>Row count, file count and both data sizes must be non-negative when present; column
     * statistics are checked by
     * {@link #validateColumnStatistics(SchemaTableName, String, String, OptionalLong, HiveColumnStatistics)}.
     *
     * @param table table the statistics belong to (used in error messages)
     * @param partitionStatistics statistics keyed by partition name
     * @throws PrestoException with {@code HIVE_CORRUPTED_COLUMN_STATISTICS} on the first violation
     */
    @VisibleForTesting
    static void validatePartitionStatistics(SchemaTableName table, Map<String, PartitionStatistics> partitionStatistics) {
        partitionStatistics.forEach((partition, statistics) -> {
            HiveBasicStatistics basicStatistics = statistics.getBasicStatistics();
            OptionalLong rowCount = basicStatistics.getRowCount();
            rowCount.ifPresent(count -> checkStatistics(count >= 0, table, partition,
                    "rowCount must be greater than or equal to zero: %s", count));
            basicStatistics.getFileCount().ifPresent(count -> checkStatistics(count >= 0, table, partition,
                    "fileCount must be greater than or equal to zero: %s", count));
            basicStatistics.getInMemoryDataSizeInBytes().ifPresent(size -> checkStatistics(size >= 0, table, partition,
                    "inMemoryDataSizeInBytes must be greater than or equal to zero: %s", size));
            basicStatistics.getOnDiskDataSizeInBytes().ifPresent(size -> checkStatistics(size >= 0, table, partition,
                    "onDiskDataSizeInBytes must be greater than or equal to zero: %s", size));
            statistics.getColumnStatistics().forEach((column, columnStatistics) ->
                    validateColumnStatistics(table, partition, column, rowCount, columnStatistics));
        });
    }

    /**
     * Checks the statistics of a single column of a single partition.
     *
     * <p>Sizes and counts must be non-negative; nulls count, distinct values count and boolean
     * true/false counts must not exceed the row count when it is known; distinct values count
     * must not exceed the non-null count when both are known; and every present min/max pair
     * must satisfy {@code min <= max}.
     *
     * @throws PrestoException with {@code HIVE_CORRUPTED_COLUMN_STATISTICS} on the first violation
     */
    private static void validateColumnStatistics(
            SchemaTableName table,
            String partition,
            String column,
            OptionalLong rowCount,
            HiveColumnStatistics columnStatistics) {
        columnStatistics.getMaxValueSizeInBytes()
                .ifPresent(maxValueSizeInBytes -> checkStatistics(maxValueSizeInBytes >= 0, table, partition, column,
                        "maxValueSizeInBytes must be greater than or equal to zero: %s", maxValueSizeInBytes));
        columnStatistics.getTotalSizeInBytes()
                .ifPresent(totalSizeInBytes -> checkStatistics(totalSizeInBytes >= 0, table, partition, column,
                        "totalSizeInBytes must be greater than or equal to zero: %s", totalSizeInBytes));
        columnStatistics.getNullsCount().ifPresent(nullsCount -> {
            checkStatistics(nullsCount >= 0, table, partition, column,
                    "nullsCount must be greater than or equal to zero: %s", nullsCount);
            if (rowCount.isPresent()) {
                checkStatistics(nullsCount <= rowCount.getAsLong(), table, partition, column,
                        "nullsCount must be less than or equal to rowCount. nullsCount: %s. rowCount: %s.",
                        nullsCount, rowCount.getAsLong());
            }
        });
        columnStatistics.getDistinctValuesCount().ifPresent(distinctValuesCount -> {
            checkStatistics(distinctValuesCount >= 0, table, partition, column,
                    "distinctValuesCount must be greater than or equal to zero: %s", distinctValuesCount);
            if (rowCount.isPresent()) {
                checkStatistics(distinctValuesCount <= rowCount.getAsLong(), table, partition, column,
                        "distinctValuesCount must be less than or equal to rowCount. distinctValuesCount: %s. rowCount: %s.",
                        distinctValuesCount, rowCount.getAsLong());
            }
            if (rowCount.isPresent() && columnStatistics.getNullsCount().isPresent()) {
                long nonNullsCount = rowCount.getAsLong() - columnStatistics.getNullsCount().getAsLong();
                checkStatistics(distinctValuesCount <= nonNullsCount, table, partition, column,
                        "distinctValuesCount must be less than or equal to nonNullsCount. distinctValuesCount: %s. nonNullsCount: %s.",
                        distinctValuesCount, nonNullsCount);
            }
        });
        columnStatistics.getIntegerStatistics().ifPresent(integerStatistics -> {
            OptionalLong min = integerStatistics.getMin();
            OptionalLong max = integerStatistics.getMax();
            if (min.isPresent() && max.isPresent()) {
                checkStatistics(min.getAsLong() <= max.getAsLong(), table, partition, column,
                        "integerStatistics.min must be less than or equal to integerStatistics.max. integerStatistics.min: %s. integerStatistics.max: %s.",
                        min.getAsLong(), max.getAsLong());
            }
        });
        columnStatistics.getDoubleStatistics().ifPresent(doubleStatistics -> {
            OptionalDouble min = doubleStatistics.getMin();
            OptionalDouble max = doubleStatistics.getMax();
            // NaN bounds are skipped rather than rejected: any comparison with NaN is false.
            if (min.isPresent() && max.isPresent() && !isNaN(min.getAsDouble()) && !isNaN(max.getAsDouble())) {
                checkStatistics(min.getAsDouble() <= max.getAsDouble(), table, partition, column,
                        "doubleStatistics.min must be less than or equal to doubleStatistics.max. doubleStatistics.min: %s. doubleStatistics.max: %s.",
                        min.getAsDouble(), max.getAsDouble());
            }
        });
        columnStatistics.getDecimalStatistics().ifPresent(decimalStatistics -> {
            Optional<BigDecimal> min = decimalStatistics.getMin();
            Optional<BigDecimal> max = decimalStatistics.getMax();
            if (min.isPresent() && max.isPresent()) {
                checkStatistics(min.get().compareTo(max.get()) <= 0, table, partition, column,
                        "decimalStatistics.min must be less than or equal to decimalStatistics.max. decimalStatistics.min: %s. decimalStatistics.max: %s.",
                        min.get(), max.get());
            }
        });
        columnStatistics.getDateStatistics().ifPresent(dateStatistics -> {
            Optional<LocalDate> min = dateStatistics.getMin();
            Optional<LocalDate> max = dateStatistics.getMax();
            if (min.isPresent() && max.isPresent()) {
                checkStatistics(min.get().compareTo(max.get()) <= 0, table, partition, column,
                        "dateStatistics.min must be less than or equal to dateStatistics.max. dateStatistics.min: %s. dateStatistics.max: %s.",
                        min.get(), max.get());
            }
        });
        columnStatistics.getBooleanStatistics().ifPresent(booleanStatistics -> {
            OptionalLong falseCount = booleanStatistics.getFalseCount();
            OptionalLong trueCount = booleanStatistics.getTrueCount();
            falseCount.ifPresent(count -> checkStatistics(count >= 0, table, partition, column,
                    "falseCount must be greater than or equal to zero: %s", count));
            trueCount.ifPresent(count -> checkStatistics(count >= 0, table, partition, column,
                    "trueCount must be greater than or equal to zero: %s", count));
            if (rowCount.isPresent() && falseCount.isPresent()) {
                checkStatistics(falseCount.getAsLong() <= rowCount.getAsLong(), table, partition, column,
                        "booleanStatistics.falseCount must be less than or equal to rowCount. booleanStatistics.falseCount: %s. rowCount: %s.",
                        falseCount.getAsLong(), rowCount.getAsLong());
            }
            if (rowCount.isPresent() && trueCount.isPresent()) {
                checkStatistics(trueCount.getAsLong() <= rowCount.getAsLong(), table, partition, column,
                        "booleanStatistics.trueCount must be less than or equal to rowCount. booleanStatistics.trueCount: %s. rowCount: %s.",
                        trueCount.getAsLong(), rowCount.getAsLong());
            }
        });
    }

    /**
     * Throws a corrupted-statistics error for a column-level check that failed.
     *
     * @param expression condition that must hold
     * @param message {@link String#format(String, Object...)} pattern describing the violation
     * @param args arguments for {@code message}
     * @throws PrestoException with {@code HIVE_CORRUPTED_COLUMN_STATISTICS} if {@code expression} is false
     */
    private static void checkStatistics(boolean expression, SchemaTableName table, String partition, String column, String message, Object... args) {
        if (!expression) {
            throw new PrestoException(HIVE_CORRUPTED_COLUMN_STATISTICS,
                    format("Corrupted partition statistics (Table: %s Partition: [%s] Column: %s): %s",
                            table, partition, column, format(message, args)));
        }
    }

    /**
     * Throws a corrupted-statistics error for a partition-level check that failed.
     *
     * @param expression condition that must hold
     * @param message {@link String#format(String, Object...)} pattern describing the violation
     * @param args arguments for {@code message}
     * @throws PrestoException with {@code HIVE_CORRUPTED_COLUMN_STATISTICS} if {@code expression} is false
     */
    private static void checkStatistics(boolean expression, SchemaTableName table, String partition, String message, Object... args) {
        if (!expression) {
            throw new PrestoException(HIVE_CORRUPTED_COLUMN_STATISTICS,
                    format("Corrupted partition statistics (Table: %s Partition: [%s]): %s",
                            table, partition, format(message, args)));
        }
    }

    /**
     * Extrapolates the sampled partition statistics to all requested partitions.
     *
     * <p>The row count is the average known row count per sampled partition multiplied by the
     * number of requested partitions. Returns empty statistics when there are no sampled
     * statistics or none of them has a row count.
     *
     * @param columns column handles keyed by column name; values are cast to {@link HiveColumnHandle}
     * @param columnTypes column types keyed by column name
     * @param partitions all requested partitions; must not be empty when {@code statistics} is non-empty
     * @param statistics sampled statistics keyed by partition name
     */
    private static TableStatistics getTableStatistics(
            Map<String, ColumnHandle> columns,
            Map<String, Type> columnTypes,
            List<HivePartition> partitions,
            Map<String, PartitionStatistics> statistics) {
        if (statistics.isEmpty()) {
            return TableStatistics.empty();
        }
        checkArgument(!partitions.isEmpty(), "partitions is empty");
        OptionalDouble optionalAverageRowsPerPartition = calculateAverageRowsPerPartition(statistics.values());
        if (!optionalAverageRowsPerPartition.isPresent()) {
            return TableStatistics.empty();
        }
        double averageRowsPerPartition = optionalAverageRowsPerPartition.getAsDouble();
        verify(averageRowsPerPartition >= 0, "averageRowsPerPartition must be greater than or equal to zero");
        int queriedPartitionsCount = partitions.size();
        double rowCount = averageRowsPerPartition * queriedPartitionsCount;
        TableStatistics.Builder result = TableStatistics.builder();
        result.setRowCount(Estimate.of(rowCount));
        for (Map.Entry<String, ColumnHandle> column : columns.entrySet()) {
            String columnName = column.getKey();
            HiveColumnHandle columnHandle = (HiveColumnHandle) column.getValue();
            Type columnType = columnTypes.get(columnName);
            ColumnStatistics columnStatistics;
            // Partition keys are derived from the partition values themselves; data columns from metastore column statistics.
            if (columnHandle.isPartitionKey()) {
                columnStatistics = createPartitionColumnStatistics(columnHandle, columnType, partitions, statistics, averageRowsPerPartition, rowCount);
            } else {
                columnStatistics = createDataColumnStatistics(columnName, columnType, rowCount, statistics.values());
            }
            result.setColumnStatistics(columnHandle, columnStatistics);
        }
        return result.build();
    }

    /**
     * Averages the row counts of the partitions that report one.
     *
     * @return the average, or empty if no partition has a row count
     * @throws VerifyException if a row count is negative
     */
    @VisibleForTesting
    static OptionalDouble calculateAverageRowsPerPartition(Collection<PartitionStatistics> statistics) {
        return statistics.stream().map(PartitionStatistics::getBasicStatistics)
                .map(HiveBasicStatistics::getRowCount).filter(OptionalLong::isPresent)
                .mapToLong(OptionalLong::getAsLong)
                .peek(count -> verify(count >= 0, "count must be greater than or equal to zero")).average();
    }

    /**
     * Builds statistics for a partition-key column from the partition values.
     *
     * <p>Distinct values count and range only consider partitions whose row count (known, or
     * else the average) is non-zero; nulls fraction and data size consider all partitions.
     */
    private static ColumnStatistics createPartitionColumnStatistics(
            HiveColumnHandle column,
            Type type,
            List<HivePartition> partitions,
            Map<String, PartitionStatistics> statistics,
            double averageRowsPerPartition,
            double rowCount) {
        List<HivePartition> nonEmptyPartitions = partitions.stream()
                .filter(partition -> getPartitionRowCount(partition.getPartitionId(), statistics)
                        .orElse(averageRowsPerPartition) != 0)
                .collect(toImmutableList());
        return ColumnStatistics.builder()
                .setDistinctValuesCount(Estimate.of(calculateDistinctPartitionKeys(column, nonEmptyPartitions)))
                .setNullsFraction(Estimate.of(calculateNullsFractionForPartitioningKey(column, partitions, statistics, averageRowsPerPartition, rowCount)))
                .setRange(calculateRangeForPartitioningKey(column, type, nonEmptyPartitions))
                .setDataSize(calculateDataSizeForPartitioningKey(column, type, partitions, statistics, averageRowsPerPartition))
                .build();
    }

    /**
     * Counts the distinct non-null values the given partition key takes across the partitions.
     */
    @VisibleForTesting
    static long calculateDistinctPartitionKeys(HiveColumnHandle column, List<HivePartition> partitions) {
        return partitions.stream().map(partition -> partition.getKeys().get(column))
                .filter(value -> !value.isNull()).distinct().count();
    }

    /**
     * Estimates the fraction of rows whose partition key is null.
     *
     * <p>Sums the row counts (known, or else the average) of the partitions whose key value is
     * null and divides by {@code rowCount}; the result is clamped to {@code [0, 1]}.
     *
     * @return 0 if {@code rowCount} is 0, otherwise the clamped fraction
     */
    @VisibleForTesting
    static double calculateNullsFractionForPartitioningKey(
            HiveColumnHandle column,
            List<HivePartition> partitions,
            Map<String, PartitionStatistics> statistics,
            double averageRowsPerPartition,
            double rowCount) {
        if (rowCount == 0) {
            return 0;
        }
        double estimatedNullsCount = partitions.stream()
                .filter(partition -> partition.getKeys().get(column).isNull()).map(HivePartition::getPartitionId)
                .mapToDouble(partitionName -> getPartitionRowCount(partitionName, statistics)
                        .orElse(averageRowsPerPartition))
                .sum();
        return normalizeFraction(estimatedNullsCount / rowCount);
    }

    /**
     * Clamps a fraction to the {@code [0, 1]} interval.
     *
     * @throws IllegalArgumentException if {@code fraction} is NaN or infinite
     */
    private static double normalizeFraction(double fraction) {
        checkArgument(!isNaN(fraction), "fraction is NaN");
        checkArgument(isFinite(fraction), "fraction must be finite");
        if (fraction < 0) {
            return 0;
        }
        if (fraction > 1) {
            return 1;
        }
        return fraction;
    }

    /**
     * Estimates the total data size of a partition-key column.
     *
     * <p>For each partition the length of the key value (0 for null) is multiplied by the
     * partition's row count (known, or else the average) and the products are summed.
     *
     * @return unknown for types rejected by {@link #hasDataSize(Type)}, otherwise the sum
     */
    @VisibleForTesting
    static Estimate calculateDataSizeForPartitioningKey(
            HiveColumnHandle column,
            Type type,
            List<HivePartition> partitions,
            Map<String, PartitionStatistics> statistics,
            double averageRowsPerPartition) {
        if (!hasDataSize(type)) {
            return Estimate.unknown();
        }
        double dataSize = 0;
        for (HivePartition partition : partitions) {
            int length = getSize(partition.getKeys().get(column));
            double rowCount = getPartitionRowCount(partition.getPartitionId(), statistics)
                    .orElse(averageRowsPerPartition);
            dataSize += length * rowCount;
        }
        return Estimate.of(dataSize);
    }

    /** Returns whether a data size is reported for the type: true for varchar and char types. */
    private static boolean hasDataSize(Type type) {
        return isVarcharType(type) || isCharType(type);
    }

    /**
     * Returns the length of a {@link Slice}-backed value, or 0 for a null value.
     *
     * @throws IllegalArgumentException if the value is non-null and not a {@link Slice}
     */
    private static int getSize(NullableValue nullableValue) {
        if (nullableValue.isNull()) {
            return 0;
        }
        Object value = nullableValue.getValue();
        checkArgument(value instanceof Slice, "value is expected to be of Slice type");
        return ((Slice) value).length();
    }

    /**
     * Looks up the row count of a partition in the sampled statistics.
     *
     * @return the row count, or empty if the partition was not sampled or has no row count
     * @throws VerifyException if the row count is negative
     */
    private static OptionalDouble getPartitionRowCount(String partitionName, Map<String, PartitionStatistics> statistics) {
        PartitionStatistics partitionStatistics = statistics.get(partitionName);
        if (partitionStatistics == null) {
            return OptionalDouble.empty();
        }
        OptionalLong rowCount = partitionStatistics.getBasicStatistics().getRowCount();
        if (rowCount.isPresent()) {
            verify(rowCount.getAsLong() >= 0, "rowCount must be greater than or equal to zero");
            return OptionalDouble.of(rowCount.getAsLong());
        }
        return OptionalDouble.empty();
    }

    /**
     * Computes the min/max range of the non-null values of a partition key.
     *
     * @return empty if the type is rejected by {@link #isRangeSupported(Type)} or every value is null
     */
    @VisibleForTesting
    static Optional<DoubleRange> calculateRangeForPartitioningKey(HiveColumnHandle column, Type type, List<HivePartition> partitions) {
        if (!isRangeSupported(type)) {
            return Optional.empty();
        }
        List<Double> values = partitions.stream().map(HivePartition::getKeys).map(keys -> keys.get(column))
                .filter(value -> !value.isNull()).map(NullableValue::getValue)
                .map(value -> convertPartitionValueToDouble(type, value)).collect(toImmutableList());
        if (values.isEmpty()) {
            return Optional.empty();
        }
        double min = values.get(0);
        double max = values.get(0);
        for (Double value : values) {
            if (value > max) {
                max = value;
            }
            if (value < min) {
                min = value;
            }
        }
        return Optional.of(new DoubleRange(min, max));
    }

    /**
     * Converts a partition key value to a double according to its type.
     *
     * <p>Integer types and DATE are read as {@code Long}; DOUBLE as {@code Double}; REAL as a
     * {@code Long} whose low 32 bits are reinterpreted as a float; decimals are rendered to a
     * string using the type's scale and parsed.
     *
     * @param value non-null value in the representation described above
     * @throws IllegalArgumentException if the type is not one of the handled types
     * @throws ClassCastException if {@code value} does not have the representation expected for the type
     */
    @VisibleForTesting
    static double convertPartitionValueToDouble(Type type, Object value) {
        if (type.equals(BIGINT) || type.equals(INTEGER) || type.equals(SMALLINT) || type.equals(TINYINT)) {
            return (Long) value;
        }
        if (type.equals(DOUBLE)) {
            return (Double) value;
        }
        if (type.equals(REAL)) {
            return intBitsToFloat(((Long) value).intValue());
        }
        if (type instanceof DecimalType) {
            DecimalType decimalType = (DecimalType) type;
            if (isShortDecimal(decimalType)) {
                return parseDouble(Decimals.toString((Long) value, decimalType.getScale()));
            }
            if (isLongDecimal(decimalType)) {
                return parseDouble(Decimals.toString((Slice) value, decimalType.getScale()));
            }
            throw new IllegalArgumentException("Unexpected decimal type: " + decimalType);
        }
        if (type.equals(DATE)) {
            // NOTE(review): presumably days since epoch, matching createDateRange's toEpochDay() -- confirm.
            return (Long) value;
        }
        throw new IllegalArgumentException("Unexpected type: " + type);
    }

    /**
     * Builds statistics for a regular (non-partition-key) column from the sampled partitions.
     *
     * @param column column name used to look up per-partition column statistics
     * @param type column type
     * @param rowsCount estimated total row count over all requested partitions
     * @param partitionStatistics sampled partition statistics
     * @return empty statistics if no sampled partition has statistics for the column
     */
    @VisibleForTesting
    static ColumnStatistics createDataColumnStatistics(String column, Type type, double rowsCount, Collection<PartitionStatistics> partitionStatistics) {
        List<HiveColumnStatistics> columnStatistics = partitionStatistics.stream()
                .map(PartitionStatistics::getColumnStatistics).map(statistics -> statistics.get(column))
                .filter(Objects::nonNull).collect(toImmutableList());
        if (columnStatistics.isEmpty()) {
            return ColumnStatistics.empty();
        }
        return ColumnStatistics.builder().setDistinctValuesCount(calculateDistinctValuesCount(columnStatistics))
                .setNullsFraction(calculateNullsFraction(column, partitionStatistics))
                .setDataSize(calculateDataSize(column, partitionStatistics, rowsCount))
                .setRange(calculateRange(type, columnStatistics)).build();
    }

    /**
     * Returns the largest per-partition distinct values count, or unknown if none is available.
     *
     * @throws VerifyException if a distinct values count is negative
     */
    @VisibleForTesting
    static Estimate calculateDistinctValuesCount(List<HiveColumnStatistics> columnStatistics) {
        return columnStatistics.stream().map(MetastoreHiveStatisticsProvider::getDistinctValuesCount)
                .filter(OptionalLong::isPresent).map(OptionalLong::getAsLong)
                .peek(distinctValuesCount -> verify(distinctValuesCount >= 0, "distinctValuesCount must be greater than or equal to zero"))
                .max(Long::compare).map(Estimate::of).orElse(Estimate.unknown());
    }

    /**
     * Returns the distinct values count of a single partition's column statistics.
     *
     * <p>When boolean statistics carry both a true and a false count, the result is the number
     * of those two counts that are positive (0, 1 or 2). Otherwise the stored distinct values
     * count is returned if present.
     */
    private static OptionalLong getDistinctValuesCount(HiveColumnStatistics statistics) {
        if (statistics.getBooleanStatistics().isPresent()
                && statistics.getBooleanStatistics().get().getFalseCount().isPresent()
                && statistics.getBooleanStatistics().get().getTrueCount().isPresent()) {
            long falseCount = statistics.getBooleanStatistics().get().getFalseCount().getAsLong();
            long trueCount = statistics.getBooleanStatistics().get().getTrueCount().getAsLong();
            return OptionalLong.of((falseCount > 0 ? 1 : 0) + (trueCount > 0 ? 1 : 0));
        }
        if (statistics.getDistinctValuesCount().isPresent()) {
            return statistics.getDistinctValuesCount();
        }
        return OptionalLong.empty();
    }

    /**
     * Estimates the nulls fraction of a column as total nulls divided by total rows, using only
     * the partitions that report both a row count and a nulls count for the column.
     *
     * @return unknown if no partition qualifies, zero if the qualifying partitions have no rows,
     *         otherwise the ratio
     * @throws VerifyException if a count is negative or a nulls count exceeds its row count
     */
    @VisibleForTesting
    static Estimate calculateNullsFraction(String column, Collection<PartitionStatistics> partitionStatistics) {
        List<PartitionStatistics> statisticsWithKnownRowCountAndNullsCount = partitionStatistics.stream()
                .filter(statistics -> {
                    if (!statistics.getBasicStatistics().getRowCount().isPresent()) {
                        return false;
                    }
                    HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column);
                    if (columnStatistics == null) {
                        return false;
                    }
                    return columnStatistics.getNullsCount().isPresent();
                }).collect(toImmutableList());
        if (statisticsWithKnownRowCountAndNullsCount.isEmpty()) {
            return Estimate.unknown();
        }
        long totalNullsCount = 0;
        long totalRowCount = 0;
        for (PartitionStatistics statistics : statisticsWithKnownRowCountAndNullsCount) {
            long rowCount = statistics.getBasicStatistics().getRowCount()
                    .orElseThrow(() -> new VerifyException("rowCount is not present"));
            verify(rowCount >= 0, "rowCount must be greater than or equal to zero");
            HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column);
            verify(columnStatistics != null, "columnStatistics is null");
            long nullsCount = columnStatistics.getNullsCount()
                    .orElseThrow(() -> new VerifyException("nullsCount is not present"));
            verify(nullsCount >= 0, "nullsCount must be greater than or equal to zero");
            verify(nullsCount <= rowCount, "nullsCount must be less than or equal to rowCount. nullsCount: %s. rowCount: %s.", nullsCount, rowCount);
            totalNullsCount += nullsCount;
            totalRowCount += rowCount;
        }
        if (totalRowCount == 0) {
            return Estimate.zero();
        }
        verify(totalNullsCount <= totalRowCount,
                "totalNullsCount must be less than or equal to totalRowCount. totalNullsCount: %s. totalRowCount: %s.",
                totalNullsCount, totalRowCount);
        return Estimate.of(((double) totalNullsCount) / totalRowCount);
    }

    /**
     * Estimates the total data size of a column by scaling the average bytes per row, measured
     * over the partitions that report both a row count and a total size, to {@code totalRowCount}.
     *
     * @param totalRowCount estimated row count over all requested partitions
     * @return unknown if no partition qualifies; zero if {@code totalRowCount} is 0; unknown if
     *         the qualifying partitions have no rows; otherwise the scaled size
     * @throws VerifyException if a row count or data size is negative
     */
    @VisibleForTesting
    static Estimate calculateDataSize(String column, Collection<PartitionStatistics> partitionStatistics, double totalRowCount) {
        List<PartitionStatistics> statisticsWithKnownRowCountAndDataSize = partitionStatistics.stream()
                .filter(statistics -> {
                    if (!statistics.getBasicStatistics().getRowCount().isPresent()) {
                        return false;
                    }
                    HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column);
                    if (columnStatistics == null) {
                        return false;
                    }
                    return columnStatistics.getTotalSizeInBytes().isPresent();
                }).collect(toImmutableList());
        if (statisticsWithKnownRowCountAndDataSize.isEmpty()) {
            return Estimate.unknown();
        }
        long knownRowCount = 0;
        long knownDataSize = 0;
        for (PartitionStatistics statistics : statisticsWithKnownRowCountAndDataSize) {
            long rowCount = statistics.getBasicStatistics().getRowCount()
                    .orElseThrow(() -> new VerifyException("rowCount is not present"));
            verify(rowCount >= 0, "rowCount must be greater than or equal to zero");
            HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column);
            verify(columnStatistics != null, "columnStatistics is null");
            long dataSize = columnStatistics.getTotalSizeInBytes()
                    .orElseThrow(() -> new VerifyException("totalSizeInBytes is not present"));
            verify(dataSize >= 0, "dataSize must be greater than or equal to zero");
            knownRowCount += rowCount;
            knownDataSize += dataSize;
        }
        if (totalRowCount == 0) {
            return Estimate.zero();
        }
        // Without any known rows there is no per-row average to scale.
        if (knownRowCount == 0) {
            return Estimate.unknown();
        }
        double averageValueDataSizeInBytes = ((double) knownDataSize) / knownRowCount;
        return Estimate.of(averageValueDataSizeInBytes * totalRowCount);
    }

    /**
     * Unions the per-partition value ranges of a column.
     *
     * @return empty if the type is rejected by {@link #isRangeSupported(Type)} or no partition
     *         yields a range
     */
    @VisibleForTesting
    static Optional<DoubleRange> calculateRange(Type type, List<HiveColumnStatistics> columnStatistics) {
        if (!isRangeSupported(type)) {
            return Optional.empty();
        }
        return columnStatistics.stream().map(statistics -> createRange(type, statistics))
                .filter(Optional::isPresent).map(Optional::get).reduce(DoubleRange::union);
    }

    /** Returns whether a numeric range can be produced for the type (integer types, REAL, DOUBLE, DATE, decimals). */
    private static boolean isRangeSupported(Type type) {
        return type.equals(TINYINT) || type.equals(SMALLINT) || type.equals(INTEGER) || type.equals(BIGINT)
                || type.equals(REAL) || type.equals(DOUBLE) || type.equals(DATE) || type instanceof DecimalType;
    }

    /**
     * Builds the value range of a column from the statistics flavour that matches its type.
     *
     * @return empty if the matching statistics, or their min or max, are absent
     * @throws IllegalArgumentException if the type is not one of the handled types
     */
    private static Optional<DoubleRange> createRange(Type type, HiveColumnStatistics statistics) {
        if (type.equals(BIGINT) || type.equals(INTEGER) || type.equals(SMALLINT) || type.equals(TINYINT)) {
            return statistics.getIntegerStatistics()
                    .flatMap(integerStatistics -> createIntegerRange(type, integerStatistics));
        }
        if (type.equals(DOUBLE) || type.equals(REAL)) {
            return statistics.getDoubleStatistics().flatMap(MetastoreHiveStatisticsProvider::createDoubleRange);
        }
        if (type.equals(DATE)) {
            return statistics.getDateStatistics().flatMap(MetastoreHiveStatisticsProvider::createDateRange);
        }
        if (type instanceof DecimalType) {
            return statistics.getDecimalStatistics().flatMap(MetastoreHiveStatisticsProvider::createDecimalRange);
        }
        throw new IllegalArgumentException("Unexpected type: " + type);
    }

    /** Builds an integer range when both min and max are present; empty otherwise. */
    private static Optional<DoubleRange> createIntegerRange(Type type, IntegerStatistics statistics) {
        if (statistics.getMin().isPresent() && statistics.getMax().isPresent()) {
            return Optional
                    .of(createIntegerRange(type, statistics.getMin().getAsLong(), statistics.getMax().getAsLong()));
        }
        return Optional.empty();
    }

    /** Builds a range from the given bounds after saturating each of them to the value domain of {@code type}. */
    private static DoubleRange createIntegerRange(Type type, long min, long max) {
        return new DoubleRange(normalizeIntegerValue(type, min), normalizeIntegerValue(type, max));
    }

    /**
     * Saturates a value to the domain of the given integer type (int, short or byte limits);
     * BIGINT values are returned unchanged.
     *
     * @throws IllegalArgumentException if the type is not an integer type
     */
    private static long normalizeIntegerValue(Type type, long value) {
        if (type.equals(BIGINT)) {
            return value;
        }
        if (type.equals(INTEGER)) {
            return Ints.saturatedCast(value);
        }
        if (type.equals(SMALLINT)) {
            return Shorts.saturatedCast(value);
        }
        if (type.equals(TINYINT)) {
            return SignedBytes.saturatedCast(value);
        }
        throw new IllegalArgumentException("Unexpected type: " + type);
    }

    /** Builds a double range when both min and max are present and neither is NaN; empty otherwise. */
    private static Optional<DoubleRange> createDoubleRange(DoubleStatistics statistics) {
        if (statistics.getMin().isPresent() && statistics.getMax().isPresent()
                && !isNaN(statistics.getMin().getAsDouble()) && !isNaN(statistics.getMax().getAsDouble())) {
            return Optional
                    .of(new DoubleRange(statistics.getMin().getAsDouble(), statistics.getMax().getAsDouble()));
        }
        return Optional.empty();
    }

    /** Builds a range of epoch-day values when both min and max dates are present; empty otherwise. */
    private static Optional<DoubleRange> createDateRange(DateStatistics statistics) {
        if (statistics.getMin().isPresent() && statistics.getMax().isPresent()) {
            return Optional.of(new DoubleRange(statistics.getMin().get().toEpochDay(),
                    statistics.getMax().get().toEpochDay()));
        }
        return Optional.empty();
    }

    /** Builds a range from the double values of min and max when both are present; empty otherwise. */
    private static Optional<DoubleRange> createDecimalRange(DecimalStatistics statistics) {
        if (statistics.getMin().isPresent() && statistics.getMax().isPresent()) {
            return Optional.of(new DoubleRange(statistics.getMin().get().doubleValue(),
                    statistics.getMax().get().doubleValue()));
        }
        return Optional.empty();
    }

    /**
     * Source of per-partition statistics; allows tests to replace the metastore lookup.
     */
    @VisibleForTesting
    interface PartitionsStatisticsProvider {
        /**
         * Returns statistics for the given partitions of a table, keyed by partition name.
         */
        Map<String, PartitionStatistics> getPartitionsStatistics(SchemaTableName table, List<HivePartition> hivePartitions);
    }
}