Java tutorial: HivePartitionManager from the Presto Hive connector (io.prestosql.plugin.hive)
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.prestosql.plugin.hive;

import com.google.common.base.Predicates;
import com.google.common.base.VerifyException;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import io.airlift.slice.Slice;
import io.prestosql.plugin.hive.HiveBucketing.HiveBucketFilter;
import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore;
import io.prestosql.plugin.hive.metastore.Table;
import io.prestosql.spi.PrestoException;
import io.prestosql.spi.connector.ColumnHandle;
import io.prestosql.spi.connector.ConnectorTableHandle;
import io.prestosql.spi.connector.Constraint;
import io.prestosql.spi.connector.SchemaTableName;
import io.prestosql.spi.connector.TableNotFoundException;
import io.prestosql.spi.predicate.Domain;
import io.prestosql.spi.predicate.NullableValue;
import io.prestosql.spi.predicate.TupleDomain;
import io.prestosql.spi.predicate.ValueSet;
import io.prestosql.spi.type.BigintType;
import io.prestosql.spi.type.BooleanType;
import io.prestosql.spi.type.CharType;
import io.prestosql.spi.type.DateType;
import io.prestosql.spi.type.DecimalType;
import io.prestosql.spi.type.Decimals;
import io.prestosql.spi.type.DoubleType;
import io.prestosql.spi.type.IntegerType;
import io.prestosql.spi.type.RealType;
import io.prestosql.spi.type.SmallintType;
import io.prestosql.spi.type.TimestampType;
import io.prestosql.spi.type.TinyintType;
import io.prestosql.spi.type.Type;
import io.prestosql.spi.type.TypeManager;
import io.prestosql.spi.type.VarcharType;
import org.apache.hadoop.hive.common.FileUtils;
import org.joda.time.DateTimeZone;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;

import javax.inject.Inject;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.TimeUnit;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Predicates.not;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static io.prestosql.plugin.hive.HiveBucketing.getHiveBucketFilter;
import static io.prestosql.plugin.hive.HiveBucketing.getHiveBucketHandle;
import static io.prestosql.plugin.hive.HiveUtil.getPartitionKeyColumnHandles;
import static io.prestosql.plugin.hive.HiveUtil.parsePartitionValue;
import static io.prestosql.plugin.hive.metastore.MetastoreUtil.getProtectMode;
import static io.prestosql.plugin.hive.metastore.MetastoreUtil.makePartName;
import static io.prestosql.plugin.hive.metastore.MetastoreUtil.verifyOnline;
import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED;
import static io.prestosql.spi.connector.Constraint.alwaysTrue;
import static io.prestosql.spi.predicate.TupleDomain.all;
import static io.prestosql.spi.predicate.TupleDomain.none;
import static io.prestosql.spi.type.Chars.padSpaces;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
import static java.util.stream.Collectors.toList;
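/**
 * Computes the set of Hive partitions a query needs to read. Partition-key
 * predicates are pushed down to the metastore where the string representation
 * is unambiguous, and any remaining filtering is applied to the returned
 * partition names in this class. (This summary comment is an editorial
 * addition describing the code below.)
 */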
public class HivePartitionManager
{
    private static final String PARTITION_VALUE_WILDCARD = "";

    private final DateTimeZone timeZone;
    private final boolean assumeCanonicalPartitionKeys;
    private final int domainCompactionThreshold;
    private final TypeManager typeManager;

    @Inject
    public HivePartitionManager(TypeManager typeManager, HiveClientConfig hiveClientConfig)
    {
        this(typeManager,
                hiveClientConfig.getDateTimeZone(),
                hiveClientConfig.isAssumeCanonicalPartitionKeys(),
                hiveClientConfig.getDomainCompactionThreshold());
    }

    public HivePartitionManager(
            TypeManager typeManager,
            DateTimeZone timeZone,
            boolean assumeCanonicalPartitionKeys,
            int domainCompactionThreshold)
    {
        this.timeZone = requireNonNull(timeZone, "timeZone is null");
        this.assumeCanonicalPartitionKeys = assumeCanonicalPartitionKeys;
        checkArgument(domainCompactionThreshold >= 1, "domainCompactionThreshold must be at least 1");
        this.domainCompactionThreshold = domainCompactionThreshold;
        this.typeManager = requireNonNull(typeManager, "typeManager is null");
    }

    public HivePartitionResult getPartitions(SemiTransactionalHiveMetastore metastore, ConnectorTableHandle tableHandle, Constraint<ColumnHandle> constraint)
    {
        HiveTableHandle hiveTableHandle = (HiveTableHandle) tableHandle;
        TupleDomain<ColumnHandle> effectivePredicate = constraint.getSummary();

        SchemaTableName tableName = hiveTableHandle.getSchemaTableName();
        Table table = getTable(metastore, tableName);
        Optional<HiveBucketHandle> hiveBucketHandle = getHiveBucketHandle(table);

        List<HiveColumnHandle> partitionColumns = getPartitionKeyColumnHandles(table);

        if (effectivePredicate.isNone()) {
            return new HivePartitionResult(partitionColumns, ImmutableList.of(), none(), none(), none(), hiveBucketHandle, Optional.empty());
        }

        Optional<HiveBucketFilter> bucketFilter = getHiveBucketFilter(table, effectivePredicate);
        TupleDomain<HiveColumnHandle> compactEffectivePredicate = toCompactTupleDomain(effectivePredicate, domainCompactionThreshold);

        if (partitionColumns.isEmpty()) {
            return new HivePartitionResult(partitionColumns, ImmutableList.of(new HivePartition(tableName)), compactEffectivePredicate, effectivePredicate, none(), hiveBucketHandle, bucketFilter);
        }

        List<Type> partitionTypes = partitionColumns.stream()
                .map(column -> typeManager.getType(column.getTypeSignature()))
                .collect(toList());

        List<String> partitionNames = getFilteredPartitionNames(metastore, tableName, partitionColumns, effectivePredicate);

        Iterable<HivePartition> partitionsIterable = () -> partitionNames.stream()
                // Apply extra filters which could not be done by getFilteredPartitionNames
                .map(partitionName -> parseValuesAndFilterPartition(tableName, partitionName, partitionColumns, partitionTypes, constraint))
                .filter(Optional::isPresent)
                .map(Optional::get)
                .iterator();

        // All partition key domains will be fully evaluated, so we don't need to include those
        TupleDomain<ColumnHandle> remainingTupleDomain = TupleDomain.withColumnDomains(
                Maps.filterKeys(effectivePredicate.getDomains().get(), not(Predicates.in(partitionColumns))));
        TupleDomain<ColumnHandle> enforcedTupleDomain = TupleDomain.withColumnDomains(
                Maps.filterKeys(effectivePredicate.getDomains().get(), Predicates.in(partitionColumns)));
        return new HivePartitionResult(partitionColumns, partitionsIterable, compactEffectivePredicate, remainingTupleDomain, enforcedTupleDomain, hiveBucketHandle, bucketFilter);
    }
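    // A hedged sketch of what toCompactTupleDomain (defined below) does to the
    // predicate pushed into the result above. With domainCompactionThreshold = 2
    // (an illustrative value), a predicate such as
    //
    //     col IN (1, 2, 3)   -- three singleton ranges over a BIGINT column
    //
    // has more ranges than the threshold, so the ranges are collapsed to their
    // span [1, 3]. Widening only loses pushdown precision, not correctness: the
    // compact domain admits extra values, and the full-precision domains are
    // still carried in the HivePartitionResult for later evaluation.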
    public HivePartitionResult getPartitions(SemiTransactionalHiveMetastore metastore, ConnectorTableHandle tableHandle, List<List<String>> partitionValuesList)
    {
        HiveTableHandle hiveTableHandle = (HiveTableHandle) tableHandle;
        SchemaTableName tableName = hiveTableHandle.getSchemaTableName();
        Table table = getTable(metastore, tableName);

        List<HiveColumnHandle> partitionColumns = getPartitionKeyColumnHandles(table);
        List<Type> partitionColumnTypes = partitionColumns.stream()
                .map(column -> typeManager.getType(column.getTypeSignature()))
                .collect(toImmutableList());

        List<HivePartition> partitionList = partitionValuesList.stream()
                .map(partitionValues -> makePartName(table.getPartitionColumns(), partitionValues))
                .map(partitionName -> parseValuesAndFilterPartition(tableName, partitionName, partitionColumns, partitionColumnTypes, alwaysTrue()))
                .map(partition -> partition.orElseThrow(() -> new VerifyException("partition must exist")))
                .collect(toImmutableList());

        return new HivePartitionResult(partitionColumns, partitionList, all(), all(), none(), getHiveBucketHandle(table), Optional.empty());
    }

    private static TupleDomain<HiveColumnHandle> toCompactTupleDomain(TupleDomain<ColumnHandle> effectivePredicate, int threshold)
    {
        ImmutableMap.Builder<HiveColumnHandle, Domain> builder = ImmutableMap.builder();
        effectivePredicate.getDomains().ifPresent(domains -> {
            for (Map.Entry<ColumnHandle, Domain> entry : domains.entrySet()) {
                HiveColumnHandle hiveColumnHandle = (HiveColumnHandle) entry.getKey();

                ValueSet values = entry.getValue().getValues();
                ValueSet compactValueSet = values.getValuesProcessor().<Optional<ValueSet>>transform(
                        ranges -> ranges.getRangeCount() > threshold ? Optional.of(ValueSet.ofRanges(ranges.getSpan())) : Optional.empty(),
                        discreteValues -> discreteValues.getValues().size() > threshold ? Optional.of(ValueSet.all(values.getType())) : Optional.empty(),
                        allOrNone -> Optional.empty())
                        .orElse(values);
                builder.put(hiveColumnHandle, Domain.create(compactValueSet, entry.getValue().isNullAllowed()));
            }
        });
        return TupleDomain.withColumnDomains(builder.build());
    }

    private Optional<HivePartition> parseValuesAndFilterPartition(
            SchemaTableName tableName,
            String partitionId,
            List<HiveColumnHandle> partitionColumns,
            List<Type> partitionColumnTypes,
            Constraint<ColumnHandle> constraint)
    {
        HivePartition partition = parsePartition(tableName, partitionId, partitionColumns, partitionColumnTypes, timeZone);

        Map<ColumnHandle, Domain> domains = constraint.getSummary().getDomains().get();
        for (HiveColumnHandle column : partitionColumns) {
            NullableValue value = partition.getKeys().get(column);
            Domain allowedDomain = domains.get(column);
            if (allowedDomain != null && !allowedDomain.includesNullableValue(value.getValue())) {
                return Optional.empty();
            }
        }

        if (constraint.predicate().isPresent() && !constraint.predicate().get().test(partition.getKeys())) {
            return Optional.empty();
        }

        return Optional.of(partition);
    }
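    // For example (hypothetical names): for a table partitioned on ds, the
    // method above parses the name "ds=2019-01-01" into a NullableValue for
    // the ds column handle; if the constraint's domain for ds excludes that
    // value (say the query asked for ds = '2019-02-01'), the partition is
    // dropped here rather than being scanned.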
    private Table getTable(SemiTransactionalHiveMetastore metastore, SchemaTableName tableName)
    {
        Optional<Table> target = metastore.getTable(tableName.getSchemaName(), tableName.getTableName());
        if (!target.isPresent()) {
            throw new TableNotFoundException(tableName);
        }

        Table table = target.get();
        verifyOnline(tableName, Optional.empty(), getProtectMode(table), table.getParameters());
        return table;
    }

    private List<String> getFilteredPartitionNames(SemiTransactionalHiveMetastore metastore, SchemaTableName tableName, List<HiveColumnHandle> partitionKeys, TupleDomain<ColumnHandle> effectivePredicate)
    {
        checkArgument(effectivePredicate.getDomains().isPresent());

        List<String> filter = new ArrayList<>();
        for (HiveColumnHandle partitionKey : partitionKeys) {
            Domain domain = effectivePredicate.getDomains().get().get(partitionKey);
            if (domain != null && domain.isNullableSingleValue()) {
                Object value = domain.getNullableSingleValue();
                Type type = domain.getType();
                if (value == null) {
                    filter.add(HivePartitionKey.HIVE_DEFAULT_DYNAMIC_PARTITION);
                }
                else if (type instanceof CharType) {
                    Slice slice = (Slice) value;
                    filter.add(padSpaces(slice, type).toStringUtf8());
                }
                else if (type instanceof VarcharType) {
                    Slice slice = (Slice) value;
                    filter.add(slice.toStringUtf8());
                }
                // Types above this have only a single possible representation for each value.
                // Types below this may have multiple representations for a single value. For
                // example, a boolean column may represent the false value as "0", "false" or "False".
                // The metastore distinguishes between these representations, so we cannot prune partitions
                // unless we know that all partition values use the canonical Java representation.
                else if (!assumeCanonicalPartitionKeys) {
                    filter.add(PARTITION_VALUE_WILDCARD);
                }
                else if (type instanceof DecimalType && !((DecimalType) type).isShort()) {
                    Slice slice = (Slice) value;
                    filter.add(Decimals.toString(slice, ((DecimalType) type).getScale()));
                }
                else if (type instanceof DecimalType && ((DecimalType) type).isShort()) {
                    filter.add(Decimals.toString((long) value, ((DecimalType) type).getScale()));
                }
                else if (type instanceof DateType) {
                    DateTimeFormatter dateTimeFormatter = ISODateTimeFormat.date().withZoneUTC();
                    filter.add(dateTimeFormatter.print(TimeUnit.DAYS.toMillis((long) value)));
                }
                else if (type instanceof TimestampType) {
                    // we don't have time zone info, so just add a wildcard
                    filter.add(PARTITION_VALUE_WILDCARD);
                }
                else if (type instanceof TinyintType
                        || type instanceof SmallintType
                        || type instanceof IntegerType
                        || type instanceof BigintType
                        || type instanceof DoubleType
                        || type instanceof RealType
                        || type instanceof BooleanType) {
                    filter.add(value.toString());
                }
                else {
                    throw new PrestoException(NOT_SUPPORTED, format("Unsupported partition key type: %s", type.getDisplayName()));
                }
            }
            else {
                filter.add(PARTITION_VALUE_WILDCARD);
            }
        }

        // fetch the partition names
        return metastore.getPartitionNamesByParts(tableName.getSchemaName(), tableName.getTableName(), filter)
                .orElseThrow(() -> new TableNotFoundException(tableName));
    }

    public static HivePartition parsePartition(
            SchemaTableName tableName,
            String partitionName,
            List<HiveColumnHandle> partitionColumns,
            List<Type> partitionColumnTypes,
            DateTimeZone timeZone)
    {
        List<String> partitionValues = extractPartitionValues(partitionName);
        ImmutableMap.Builder<ColumnHandle, NullableValue> builder = ImmutableMap.builder();
        for (int i = 0; i < partitionColumns.size(); i++) {
            HiveColumnHandle column = partitionColumns.get(i);
            NullableValue parsedValue = parsePartitionValue(partitionName, partitionValues.get(i), partitionColumnTypes.get(i), timeZone);
            builder.put(column, parsedValue);
        }
        Map<ColumnHandle, NullableValue> values = builder.build();
        return new HivePartition(tableName, partitionName, values);
    }
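    // Hedged illustration: given partition columns (ds DATE, country VARCHAR),
    // parsePartition(tableName, "ds=2019-01-01/country=US", ...) extracts the
    // raw strings ["2019-01-01", "US"] via extractPartitionValues (below) and
    // coerces each one to its declared column type via parsePartitionValue,
    // producing the HivePartition's key map.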
    public static List<String> extractPartitionValues(String partitionName)
    {
        ImmutableList.Builder<String> values = ImmutableList.builder();

        boolean inKey = true;
        int valueStart = -1;
        for (int i = 0; i < partitionName.length(); i++) {
            char current = partitionName.charAt(i);
            if (inKey) {
                checkArgument(current != '/', "Invalid partition spec: %s", partitionName);
                if (current == '=') {
                    inKey = false;
                    valueStart = i + 1;
                }
            }
            else if (current == '/') {
                checkArgument(valueStart != -1, "Invalid partition spec: %s", partitionName);
                values.add(FileUtils.unescapePathName(partitionName.substring(valueStart, i)));
                inKey = true;
                valueStart = -1;
            }
        }
        checkArgument(!inKey, "Invalid partition spec: %s", partitionName);
        values.add(FileUtils.unescapePathName(partitionName.substring(valueStart, partitionName.length())));

        return values.build();
    }
}
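To make the name parsing concrete, here is a minimal, hedged usage sketch of extractPartitionValues. The partition names are made up for illustration; escaping follows Hive's percent-encoding, which FileUtils.unescapePathName decodes.

import java.util.List;

public class ExtractPartitionValuesExample
{
    public static void main(String[] args)
    {
        // A two-key partition name splits into values on '=' and '/':
        List<String> values = HivePartitionManager.extractPartitionValues("ds=2019-01-01/country=US");
        System.out.println(values); // [2019-01-01, US]

        // Percent-escaped characters in the stored name are decoded:
        List<String> escaped = HivePartitionManager.extractPartitionValues("name=a%2Fb");
        System.out.println(escaped); // [a/b]
    }
}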