io.prestosql.plugin.hive.HivePartitionManager.java Source code

Introduction

Here is the source code for io.prestosql.plugin.hive.HivePartitionManager.java. This class decides which Hive partitions a query must read: it prunes partitions against the query predicate, compacts overly complex predicates down to a configurable threshold, and parses Hive partition names back into typed values.

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.prestosql.plugin.hive;

import com.google.common.base.Predicates;
import com.google.common.base.VerifyException;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import io.airlift.slice.Slice;
import io.prestosql.plugin.hive.HiveBucketing.HiveBucketFilter;
import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore;
import io.prestosql.plugin.hive.metastore.Table;
import io.prestosql.spi.PrestoException;
import io.prestosql.spi.connector.ColumnHandle;
import io.prestosql.spi.connector.ConnectorTableHandle;
import io.prestosql.spi.connector.Constraint;
import io.prestosql.spi.connector.SchemaTableName;
import io.prestosql.spi.connector.TableNotFoundException;
import io.prestosql.spi.predicate.Domain;
import io.prestosql.spi.predicate.NullableValue;
import io.prestosql.spi.predicate.TupleDomain;
import io.prestosql.spi.predicate.ValueSet;
import io.prestosql.spi.type.BigintType;
import io.prestosql.spi.type.BooleanType;
import io.prestosql.spi.type.CharType;
import io.prestosql.spi.type.DateType;
import io.prestosql.spi.type.DecimalType;
import io.prestosql.spi.type.Decimals;
import io.prestosql.spi.type.DoubleType;
import io.prestosql.spi.type.IntegerType;
import io.prestosql.spi.type.RealType;
import io.prestosql.spi.type.SmallintType;
import io.prestosql.spi.type.TimestampType;
import io.prestosql.spi.type.TinyintType;
import io.prestosql.spi.type.Type;
import io.prestosql.spi.type.TypeManager;
import io.prestosql.spi.type.VarcharType;
import org.apache.hadoop.hive.common.FileUtils;
import org.joda.time.DateTimeZone;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;

import javax.inject.Inject;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.TimeUnit;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Predicates.not;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static io.prestosql.plugin.hive.HiveBucketing.getHiveBucketFilter;
import static io.prestosql.plugin.hive.HiveBucketing.getHiveBucketHandle;
import static io.prestosql.plugin.hive.HiveUtil.getPartitionKeyColumnHandles;
import static io.prestosql.plugin.hive.HiveUtil.parsePartitionValue;
import static io.prestosql.plugin.hive.metastore.MetastoreUtil.getProtectMode;
import static io.prestosql.plugin.hive.metastore.MetastoreUtil.makePartName;
import static io.prestosql.plugin.hive.metastore.MetastoreUtil.verifyOnline;
import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED;
import static io.prestosql.spi.connector.Constraint.alwaysTrue;
import static io.prestosql.spi.predicate.TupleDomain.all;
import static io.prestosql.spi.predicate.TupleDomain.none;
import static io.prestosql.spi.type.Chars.padSpaces;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
import static java.util.stream.Collectors.toList;

public class HivePartitionManager {
    private static final String PARTITION_VALUE_WILDCARD = "";

    private final DateTimeZone timeZone;
    private final boolean assumeCanonicalPartitionKeys;
    private final int domainCompactionThreshold;
    private final TypeManager typeManager;

    @Inject
    public HivePartitionManager(TypeManager typeManager, HiveClientConfig hiveClientConfig) {
        this(typeManager, hiveClientConfig.getDateTimeZone(), hiveClientConfig.isAssumeCanonicalPartitionKeys(),
                hiveClientConfig.getDomainCompactionThreshold());
    }

    public HivePartitionManager(TypeManager typeManager, DateTimeZone timeZone,
            boolean assumeCanonicalPartitionKeys, int domainCompactionThreshold) {
        this.timeZone = requireNonNull(timeZone, "timeZone is null");
        this.assumeCanonicalPartitionKeys = assumeCanonicalPartitionKeys;
        checkArgument(domainCompactionThreshold >= 1, "domainCompactionThreshold must be at least 1");
        this.domainCompactionThreshold = domainCompactionThreshold;
        this.typeManager = requireNonNull(typeManager, "typeManager is null");
    }

    public HivePartitionResult getPartitions(SemiTransactionalHiveMetastore metastore,
            ConnectorTableHandle tableHandle, Constraint<ColumnHandle> constraint) {
        HiveTableHandle hiveTableHandle = (HiveTableHandle) tableHandle;
        TupleDomain<ColumnHandle> effectivePredicate = constraint.getSummary();

        SchemaTableName tableName = hiveTableHandle.getSchemaTableName();
        Table table = getTable(metastore, tableName);
        Optional<HiveBucketHandle> hiveBucketHandle = getHiveBucketHandle(table);

        List<HiveColumnHandle> partitionColumns = getPartitionKeyColumnHandles(table);

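        // A contradictory predicate (e.g. col = 1 AND col = 2) can match no rows,
        // so report an empty partition list without consulting the metastore.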
        if (effectivePredicate.isNone()) {
            return new HivePartitionResult(partitionColumns, ImmutableList.of(), none(), none(), none(),
                    hiveBucketHandle, Optional.empty());
        }

        Optional<HiveBucketFilter> bucketFilter = getHiveBucketFilter(table, effectivePredicate);
        TupleDomain<HiveColumnHandle> compactEffectivePredicate = toCompactTupleDomain(effectivePredicate,
                domainCompactionThreshold);

        if (partitionColumns.isEmpty()) {
            return new HivePartitionResult(partitionColumns, ImmutableList.of(new HivePartition(tableName)),
                    compactEffectivePredicate, effectivePredicate, none(), hiveBucketHandle, bucketFilter);
        }

        List<Type> partitionTypes = partitionColumns.stream()
                .map(column -> typeManager.getType(column.getTypeSignature())).collect(toList());

        List<String> partitionNames = getFilteredPartitionNames(metastore, tableName, partitionColumns,
                effectivePredicate);

        Iterable<HivePartition> partitionsIterable = () -> partitionNames.stream()
                // Apply extra filters which could not be done by getFilteredPartitionNames
                .map(partitionName -> parseValuesAndFilterPartition(tableName, partitionName, partitionColumns,
                        partitionTypes, constraint))
                .filter(Optional::isPresent).map(Optional::get).iterator();

        // All partition key domains will be fully evaluated, so we don't need to include those
        TupleDomain<ColumnHandle> remainingTupleDomain = TupleDomain.withColumnDomains(
                Maps.filterKeys(effectivePredicate.getDomains().get(), not(Predicates.in(partitionColumns))));
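        // The enforced domain is the complement: the partition-column domains that
        // this pruning guarantees, so the engine need not re-check them.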
        TupleDomain<ColumnHandle> enforcedTupleDomain = TupleDomain.withColumnDomains(
                Maps.filterKeys(effectivePredicate.getDomains().get(), Predicates.in(partitionColumns)));
        return new HivePartitionResult(partitionColumns, partitionsIterable, compactEffectivePredicate,
                remainingTupleDomain, enforcedTupleDomain, hiveBucketHandle, bucketFilter);
    }

    public HivePartitionResult getPartitions(SemiTransactionalHiveMetastore metastore,
            ConnectorTableHandle tableHandle, List<List<String>> partitionValuesList) {
        HiveTableHandle hiveTableHandle = (HiveTableHandle) tableHandle;
        SchemaTableName tableName = hiveTableHandle.getSchemaTableName();

        Table table = getTable(metastore, tableName);

        List<HiveColumnHandle> partitionColumns = getPartitionKeyColumnHandles(table);
        List<Type> partitionColumnTypes = partitionColumns.stream()
                .map(column -> typeManager.getType(column.getTypeSignature())).collect(toImmutableList());

        List<HivePartition> partitionList = partitionValuesList.stream()
                .map(partitionValues -> makePartName(table.getPartitionColumns(), partitionValues))
                .map(partitionName -> parseValuesAndFilterPartition(tableName, partitionName, partitionColumns,
                        partitionColumnTypes, alwaysTrue()))
                .map(partition -> partition.orElseThrow(() -> new VerifyException("partition must exist")))
                .collect(toImmutableList());

        return new HivePartitionResult(partitionColumns, partitionList, all(), all(), none(),
                getHiveBucketHandle(table), Optional.empty());
    }

    private static TupleDomain<HiveColumnHandle> toCompactTupleDomain(TupleDomain<ColumnHandle> effectivePredicate,
            int threshold) {
        ImmutableMap.Builder<HiveColumnHandle, Domain> builder = ImmutableMap.builder();
        effectivePredicate.getDomains().ifPresent(domains -> {
            for (Map.Entry<ColumnHandle, Domain> entry : domains.entrySet()) {
                HiveColumnHandle hiveColumnHandle = (HiveColumnHandle) entry.getKey();

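                // Compact domains that are too complex to push down efficiently:
                // a range set with more ranges than the threshold collapses to its
                // single spanning range, and a discrete set with more values than
                // the threshold widens to all values of the type.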
                ValueSet values = entry.getValue().getValues();
                ValueSet compactValueSet = values.getValuesProcessor()
                        .<Optional<ValueSet>>transform(
                                ranges -> ranges.getRangeCount() > threshold
                                        ? Optional.of(ValueSet.ofRanges(ranges.getSpan()))
                                        : Optional.empty(),
                                discreteValues -> discreteValues.getValues().size() > threshold
                                        ? Optional.of(ValueSet.all(values.getType()))
                                        : Optional.empty(),
                                allOrNone -> Optional.empty())
                        .orElse(values);
                builder.put(hiveColumnHandle, Domain.create(compactValueSet, entry.getValue().isNullAllowed()));
            }
        });
        return TupleDomain.withColumnDomains(builder.build());
    }

    private Optional<HivePartition> parseValuesAndFilterPartition(SchemaTableName tableName, String partitionId,
            List<HiveColumnHandle> partitionColumns, List<Type> partitionColumnTypes,
            Constraint<ColumnHandle> constraint) {
        HivePartition partition = parsePartition(tableName, partitionId, partitionColumns, partitionColumnTypes,
                timeZone);

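        // Drop this partition if any of its key values falls outside the domain
        // the constraint allows for that column.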
        Map<ColumnHandle, Domain> domains = constraint.getSummary().getDomains().get();
        for (HiveColumnHandle column : partitionColumns) {
            NullableValue value = partition.getKeys().get(column);
            Domain allowedDomain = domains.get(column);
            if (allowedDomain != null && !allowedDomain.includesNullableValue(value.getValue())) {
                return Optional.empty();
            }
        }

        if (constraint.predicate().isPresent() && !constraint.predicate().get().test(partition.getKeys())) {
            return Optional.empty();
        }

        return Optional.of(partition);
    }

    private Table getTable(SemiTransactionalHiveMetastore metastore, SchemaTableName tableName) {
        Optional<Table> target = metastore.getTable(tableName.getSchemaName(), tableName.getTableName());
        if (!target.isPresent()) {
            throw new TableNotFoundException(tableName);
        }
        Table table = target.get();
        verifyOnline(tableName, Optional.empty(), getProtectMode(table), table.getParameters());
        return table;
    }

    private List<String> getFilteredPartitionNames(SemiTransactionalHiveMetastore metastore,
            SchemaTableName tableName, List<HiveColumnHandle> partitionKeys,
            TupleDomain<ColumnHandle> effectivePredicate) {
        checkArgument(effectivePredicate.getDomains().isPresent());

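        // Build one filter entry per partition key: an exact string where the
        // metastore representation of the value is unambiguous, otherwise a
        // wildcard that matches any value for that key.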
        List<String> filter = new ArrayList<>();
        for (HiveColumnHandle partitionKey : partitionKeys) {
            Domain domain = effectivePredicate.getDomains().get().get(partitionKey);
            if (domain != null && domain.isNullableSingleValue()) {
                Object value = domain.getNullableSingleValue();
                Type type = domain.getType();
                if (value == null) {
                    filter.add(HivePartitionKey.HIVE_DEFAULT_DYNAMIC_PARTITION);
                } else if (type instanceof CharType) {
                    Slice slice = (Slice) value;
                    filter.add(padSpaces(slice, type).toStringUtf8());
                } else if (type instanceof VarcharType) {
                    Slice slice = (Slice) value;
                    filter.add(slice.toStringUtf8());
                }
                // Types above this have only a single possible representation for each value.
                // Types below this may have multiple representations for a single value.  For
                // example, a boolean column may represent the false value as "0", "false" or "False".
                // The metastore distinguishes between these representations, so we cannot prune partitions
                // unless we know that all partition values use the canonical Java representation.
                else if (!assumeCanonicalPartitionKeys) {
                    filter.add(PARTITION_VALUE_WILDCARD);
                } else if (type instanceof DecimalType && !((DecimalType) type).isShort()) {
                    Slice slice = (Slice) value;
                    filter.add(Decimals.toString(slice, ((DecimalType) type).getScale()));
                } else if (type instanceof DecimalType && ((DecimalType) type).isShort()) {
                    filter.add(Decimals.toString((long) value, ((DecimalType) type).getScale()));
                } else if (type instanceof DateType) {
                    DateTimeFormatter dateTimeFormatter = ISODateTimeFormat.date().withZoneUTC();
                    filter.add(dateTimeFormatter.print(TimeUnit.DAYS.toMillis((long) value)));
                } else if (type instanceof TimestampType) {
                    // we don't have time zone info, so just add a wildcard
                    filter.add(PARTITION_VALUE_WILDCARD);
                } else if (type instanceof TinyintType || type instanceof SmallintType
                        || type instanceof IntegerType || type instanceof BigintType || type instanceof DoubleType
                        || type instanceof RealType || type instanceof BooleanType) {
                    filter.add(value.toString());
                } else {
                    throw new PrestoException(NOT_SUPPORTED,
                            format("Unsupported partition key type: %s", type.getDisplayName()));
                }
            } else {
                filter.add(PARTITION_VALUE_WILDCARD);
            }
        }

        // fetch the partition names
        return metastore.getPartitionNamesByParts(tableName.getSchemaName(), tableName.getTableName(), filter)
                .orElseThrow(() -> new TableNotFoundException(tableName));
    }

    public static HivePartition parsePartition(SchemaTableName tableName, String partitionName,
            List<HiveColumnHandle> partitionColumns, List<Type> partitionColumnTypes, DateTimeZone timeZone) {
        List<String> partitionValues = extractPartitionValues(partitionName);
        ImmutableMap.Builder<ColumnHandle, NullableValue> builder = ImmutableMap.builder();
        for (int i = 0; i < partitionColumns.size(); i++) {
            HiveColumnHandle column = partitionColumns.get(i);
            NullableValue parsedValue = parsePartitionValue(partitionName, partitionValues.get(i),
                    partitionColumnTypes.get(i), timeZone);
            builder.put(column, parsedValue);
        }
        Map<ColumnHandle, NullableValue> values = builder.build();
        return new HivePartition(tableName, partitionName, values);
    }

    public static List<String> extractPartitionValues(String partitionName) {
        ImmutableList.Builder<String> values = ImmutableList.builder();

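        // Partition names have the form "key1=val1/key2=val2/...": '=' marks the
        // start of a value and '/' terminates it, so scan once and unescape each value.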
        boolean inKey = true;
        int valueStart = -1;
        for (int i = 0; i < partitionName.length(); i++) {
            char current = partitionName.charAt(i);
            if (inKey) {
                checkArgument(current != '/', "Invalid partition spec: %s", partitionName);
                if (current == '=') {
                    inKey = false;
                    valueStart = i + 1;
                }
            } else if (current == '/') {
                checkArgument(valueStart != -1, "Invalid partition spec: %s", partitionName);
                values.add(FileUtils.unescapePathName(partitionName.substring(valueStart, i)));
                inKey = true;
                valueStart = -1;
            }
        }
        checkArgument(!inKey, "Invalid partition spec: %s", partitionName);
        values.add(FileUtils.unescapePathName(partitionName.substring(valueStart)));

        return values.build();
    }
}
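
Example

For illustration, here is a minimal sketch (not part of the original file) showing how the public static helper extractPartitionValues behaves. The class name ExtractPartitionValuesExample and the partition name "ds=2019-01-01/country=US" are hypothetical; the sketch assumes only what the listing above shows.

import io.prestosql.plugin.hive.HivePartitionManager;

import java.util.List;

public class ExtractPartitionValuesExample {
    public static void main(String[] args) {
        // Splits "ds=2019-01-01/country=US" into its unescaped values;
        // prints [2019-01-01, US]
        List<String> values =
                HivePartitionManager.extractPartitionValues("ds=2019-01-01/country=US");
        System.out.println(values);
    }
}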