Source code

Java tutorial


Here is the source code for io.prestosql.plugin.hive.BackgroundHiveSplitLoader.java


/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.prestosql.plugin.hive;

import io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext;
import io.prestosql.plugin.hive.HiveBucketing.HiveBucketFilter;
import io.prestosql.plugin.hive.HiveSplit.BucketConversion;
import io.prestosql.plugin.hive.metastore.Column;
import io.prestosql.plugin.hive.metastore.Partition;
import io.prestosql.plugin.hive.metastore.Table;
import io.prestosql.plugin.hive.util.HiveFileIterator;
import io.prestosql.plugin.hive.util.HiveFileIterator.NestedDirectoryNotAllowedException;
import io.prestosql.plugin.hive.util.InternalHiveSplitFactory;
import io.prestosql.plugin.hive.util.ResumableTask;
import io.prestosql.plugin.hive.util.ResumableTasks;
import io.prestosql.spi.PrestoException;
import io.prestosql.spi.connector.ColumnHandle;
import io.prestosql.spi.connector.ConnectorSession;
import io.prestosql.spi.connector.SchemaTableName;
import io.prestosql.spi.predicate.TupleDomain;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

import java.lang.annotation.Annotation;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.Properties;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.concurrent.Executor;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.IntPredicate;

import static;
import static;
import static;
import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_BAD_DATA;
import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR;
import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_INVALID_BUCKET_FILES;
import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_INVALID_METADATA;
import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_INVALID_PARTITION_VALUE;
import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_UNKNOWN_ERROR;
import static io.prestosql.plugin.hive.HiveSessionProperties.isForceLocalScheduling;
import static io.prestosql.plugin.hive.HiveUtil.checkCondition;
import static io.prestosql.plugin.hive.HiveUtil.getFooterCount;
import static io.prestosql.plugin.hive.HiveUtil.getHeaderCount;
import static io.prestosql.plugin.hive.HiveUtil.getInputFormat;
import static io.prestosql.plugin.hive.S3SelectPushdown.shouldEnablePushdownForTable;
import static io.prestosql.plugin.hive.metastore.MetastoreUtil.getHiveSchema;
import static io.prestosql.plugin.hive.metastore.MetastoreUtil.getPartitionLocation;
import static io.prestosql.plugin.hive.util.ConfigurationUtils.toJobConf;
import static io.prestosql.plugin.hive.util.HiveFileIterator.NestedDirectoryPolicy.FAIL;
import static io.prestosql.plugin.hive.util.HiveFileIterator.NestedDirectoryPolicy.IGNORED;
import static io.prestosql.plugin.hive.util.HiveFileIterator.NestedDirectoryPolicy.RECURSE;
import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
import static org.apache.hadoop.hive.common.FileUtils.HIDDEN_FILES_PATH_FILTER;

/**
 * {@link HiveSplitLoader} that produces splits on background tasks and pushes them to a
 * {@link HiveSplitSource}.
 *
 * <p>{@code start} submits {@code loaderConcurrency} {@code HiveSplitLoaderTask}s to the supplied
 * {@link Executor}. Each task polls {@code fileIterators} and {@code partitions} and adds the resulting
 * splits to {@code hiveSplitSource}; tasks finish once {@code stop} has set the {@code stopped} flag.
 */
public class BackgroundHiveSplitLoader implements HiveSplitLoader {
    // Shared future created with immediateFuture(null); returned whenever a load step has nothing to wait for.
    private static final ListenableFuture<?> COMPLETED_FUTURE = immediateFuture(null);

    // Table whose partitions are being loaded; used for schema, partition keys, location and error messages.
    private final Table table;
    // Predicate handed to every InternalHiveSplitFactory (cast to TupleDomain<HiveColumnHandle> in loadPartition).
    private final TupleDomain<? extends ColumnHandle> compactEffectivePredicate;
    // Present when the table is read as a bucketed table; selects the bucketed code path in loadPartition.
    private final Optional<BucketSplitInfo> tableBucketInfo;
    private final HdfsEnvironment hdfsEnvironment;
    // Built once in the constructor from the session and the table's database/table name.
    private final HdfsContext hdfsContext;
    private final NamenodeStats namenodeStats;
    private final DirectoryLister directoryLister;
    // Number of HiveSplitLoaderTask instances submitted by start().
    private final int loaderConcurrency;
    // When true, file iteration uses the RECURSE nested-directory policy, otherwise IGNORED.
    private final boolean recursiveDirWalkerEnabled;
    private final Executor executor;
    private final ConnectorSession session;
    // Partitions not yet picked up by a loader task; polled in loadSplits().
    private final ConcurrentLazyQueue<HivePartitionMetadata> partitions;
    // Per-partition split iterators queued by loadPartition() and polled by loadSplits().
    private final Deque<Iterator<InternalHiveSplit>> fileIterators = new ConcurrentLinkedDeque<>();

    // Purpose of this lock:
    // * Write lock: when you need a consistent view across partitions, fileIterators, and hiveSplitSource.
    // * Read lock: when you need to modify any of the above.
    //   Make sure the lock is held throughout the period during which they may not be consistent with each other.
    // Details:
    // * When write lock is acquired, except the holder, no one can do any of the following:
    // ** poll from (or check empty) partitions
    // ** poll from (or check empty) or push to fileIterators
    // ** push to hiveSplitSource
    // * When any of the above three operations is carried out, either a read lock or a write lock must be held.
    // * When a series of operations involving two or more of the above three operations are carried out, the lock
    //   must be continuously held throughout the series of operations.
    // Implications:
    // * if you hold a read lock but not a write lock, you can do any of the above three operations, but you may
    //   see a series of operations involving two or more of the operations carried out half way.
    // NOTE(review): no lock()/unlock() calls on this lock are visible in this copy of the file; the
    // try/finally blocks that presumably held them are truncated -- confirm against the upstream source.
    private final ReentrantReadWriteLock taskExecutionLock = new ReentrantReadWriteLock();

    // Destination of all generated splits; assigned in start().
    private HiveSplitSource hiveSplitSource;
    // Set to true by stop(); read by the loader loops to end early. Declared volatile.
    private volatile boolean stopped;

    /**
     * Stores the collaborators needed for split loading. No work is scheduled here; loading begins
     * when {@code start} is called.
     *
     * @param table table whose partitions are loaded
     * @param partitions partition metadata to load; wrapped in a {@code ConcurrentLazyQueue}
     * @param compactEffectivePredicate predicate passed on to the split factories
     * @param tableBucketInfo bucketing information, present when the bucketed code path should be used
     * @param session connector session; also used to build the {@code HdfsContext}
     * @param hdfsEnvironment source of Hadoop configurations and file systems
     * @param namenodeStats stats object handed to {@code HiveFileIterator}
     * @param directoryLister directory lister handed to {@code HiveFileIterator}
     * @param executor executor on which the loader tasks are submitted
     * @param loaderConcurrency number of loader tasks submitted by {@code start}
     * @param recursiveDirWalkerEnabled whether nested directories are recursed into (otherwise ignored)
     */
    public BackgroundHiveSplitLoader(Table table, Iterable<HivePartitionMetadata> partitions,
            TupleDomain<? extends ColumnHandle> compactEffectivePredicate,
            Optional<BucketSplitInfo> tableBucketInfo, ConnectorSession session, HdfsEnvironment hdfsEnvironment,
            NamenodeStats namenodeStats, DirectoryLister directoryLister, Executor executor, int loaderConcurrency,
            boolean recursiveDirWalkerEnabled) {
        // NOTE(review): arguments are stored without null checks -- confirm callers never pass null.
        this.table = table;
        this.compactEffectivePredicate = compactEffectivePredicate;
        this.tableBucketInfo = tableBucketInfo;
        this.loaderConcurrency = loaderConcurrency;
        this.session = session;
        this.hdfsEnvironment = hdfsEnvironment;
        this.namenodeStats = namenodeStats;
        this.directoryLister = directoryLister;
        this.recursiveDirWalkerEnabled = recursiveDirWalkerEnabled;
        this.executor = executor;
        this.partitions = new ConcurrentLazyQueue<>(partitions);
        // The same HDFS context is reused for every configuration and file-system lookup made by this loader.
        this.hdfsContext = new HdfsContext(session, table.getDatabaseName(), table.getTableName());

    /**
     * Records the split source and schedules the loader tasks.
     *
     * @param splitSource destination for generated splits; stored in {@code hiveSplitSource}
     */
    public void start(HiveSplitSource splitSource) {
        this.hiveSplitSource = splitSource;
        // Submit one resumable loader task per unit of configured concurrency.
        for (int i = 0; i < loaderConcurrency; i++) {
            ResumableTasks.submit(executor, new HiveSplitLoaderTask());

    /**
     * Requests that loading stop by setting the {@code stopped} flag, which the loader loops check
     * before doing further work.
     */
    public void stop() {
        stopped = true;

    /**
     * Resumable task submitted by {@code start}. Each call to {@code process()} loops on
     * {@code loadSplits()}: it finishes when {@code stopped} is set or when loading throws, and yields
     * with {@code TaskStatus.continueOn(future)} when the returned future is not yet done.
     */
    private class HiveSplitLoaderTask implements ResumableTask {
        // NOTE(review): several statements of this method are missing from this copy (the body of the
        // finally block, closing braces, and the call that fails the split source referred to by the
        // comment in the catch block) -- restore them from the upstream source before relying on this code.
        public TaskStatus process() {
            while (true) {
                if (stopped) {
                    return TaskStatus.finished();
                ListenableFuture<?> future;
                try {
                    future = loadSplits();
                } catch (Exception e) {
                    // Normalize the failure to a PrestoException: IOExceptions become HIVE_FILESYSTEM_ERROR,
                    // anything else that is not already a PrestoException becomes HIVE_UNKNOWN_ERROR.
                    if (e instanceof IOException) {
                        e = new PrestoException(HIVE_FILESYSTEM_ERROR, e);
                    } else if (!(e instanceof PrestoException)) {
                        e = new PrestoException(HIVE_UNKNOWN_ERROR, e);
                    // Fail the split source before releasing the execution lock
                    // Otherwise, a race could occur where the split source is completed before we fail it.
                    return TaskStatus.finished();
                } finally {
                // Yield to the task framework until the pending future completes, then resume the loop.
                if (!future.isDone()) {
                    return TaskStatus.continueOn(future);

    /**
     * Checks whether both {@code partitions} and {@code fileIterators} are empty: first as a cheap
     * opportunistic check, then again in a second try block.
     *
     * <p>NOTE(review): per the inline comments, the second check is meant to run under the write lock and
     * to call {@code noMoreSplits} on the split source, but the lock handling, the early return and the
     * {@code noMoreSplits} call are all missing from this copy -- confirm against the upstream source.
     */
    private void invokeNoMoreSplitsIfNecessary() {
        try {
            // This is an opportunistic check to avoid getting the write lock unnecessarily
            if (!partitions.isEmpty() || !fileIterators.isEmpty()) {
        } catch (Exception e) {
            // A failure here is only tolerated when the loader has already been stopped.
            checkState(stopped, "Task is not marked as stopped even though it failed");
        } finally {

        try {
            // the write lock guarantees that no one is operating on the partitions, fileIterators, or hiveSplitSource, or half way through doing so.
            if (partitions.isEmpty() && fileIterators.isEmpty()) {
                // It is legal to call `noMoreSplits` multiple times or after `stop` was called.
                // Nothing bad will happen if `noMoreSplits` implementation calls methods that will try to obtain a read lock because the lock is re-entrant.
        } catch (Exception e) {
            // Same rule as above: an exception is only acceptable after stop().
            checkState(stopped, "Task is not marked as stopped even though it failed");
        } finally {

    /**
     * Performs one unit of loading work. Takes the next queued split iterator if there is one;
     * otherwise takes the next partition and delegates to {@code loadPartition}.
     *
     * @return {@code COMPLETED_FUTURE} when there is nothing to wait for, otherwise the not-yet-done
     *         future returned by the split source (or the future returned by {@code loadPartition})
     * @throws IOException declared for failures propagated from {@code loadPartition}
     */
    private ListenableFuture<?> loadSplits() throws IOException {
        Iterator<InternalHiveSplit> splits = fileIterators.poll();
        if (splits == null) {
            // No partially-consumed iterator is available: start on a new partition instead.
            HivePartitionMetadata partition = partitions.poll();
            if (partition == null) {
                // Both queues are empty, so this call has nothing to do.
                return COMPLETED_FUTURE;
            return loadPartition(partition);

        while (splits.hasNext() && !stopped) {
            // NOTE(review): the argument of addToQueue is truncated in this copy (presumably the next
            // element of `splits`), as is whatever happens to the iterator when the future is not done --
            // confirm against the upstream source.
            ListenableFuture<?> future = hiveSplitSource.addToQueue(;
            if (!future.isDone()) {
                return future;

        // No need to put the iterator back, since it's either empty or we've stopped
        return COMPLETED_FUTURE;

    /**
     * Turns one partition into splits. Depending on the input format and on bucketing, the splits are
     * either pushed straight to {@code hiveSplitSource} or exposed lazily through an iterator appended
     * to {@code fileIterators}.
     *
     * <p>Code paths, in order:
     * <ol>
     * <li>{@code SymlinkTextInputFormat}: splits are computed with {@code TextInputFormat} for each target path.
     * <li>Input formats for which {@code shouldUseFileSplitsFromInputFormat} is true: splits come from
     * {@code inputFormat.getSplits}.
     * <li>Bucketed tables: all bucket files are listed and added to the queue at once.
     * <li>Everything else: a file iterator is queued for incremental consumption by {@code loadSplits}.
     * </ol>
     *
     * @param partition metadata of the partition to load
     * @return the future of the last enqueue operation, or {@code COMPLETED_FUTURE}
     * @throws IOException declared for Hadoop file-system and split-computation failures
     * @throws PrestoException with {@code NOT_SUPPORTED} when a bucketed table uses
     *         {@code SymlinkTextInputFormat} or an input format that supplies its own file splits
     */
    // NOTE(review): several multi-line statements in this method are truncated in this copy (marked
    // below) and closing braces are missing -- restore from the upstream source before compiling.
    private ListenableFuture<?> loadPartition(HivePartitionMetadata partition) throws IOException {
        String partitionName = partition.getHivePartition().getPartitionId();
        Properties schema = getPartitionSchema(table, partition.getPartition());
        List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition());
        // Unchecked narrowing cast; presumably safe because the Hive connector only produces
        // HiveColumnHandle instances -- TODO confirm.
        TupleDomain<HiveColumnHandle> effectivePredicate = (TupleDomain<HiveColumnHandle>) compactEffectivePredicate;

        Path path = new Path(getPartitionLocation(table, partition.getPartition()));
        Configuration configuration = hdfsEnvironment.getConfiguration(hdfsContext, path);
        InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
        FileSystem fs = hdfsEnvironment.getFileSystem(hdfsContext, path);
        // NOTE(review): the remaining arguments of this call are missing in this copy.
        boolean s3SelectPushdownEnabled = shouldEnablePushdownForTable(session, table, path.toString(),

        if (inputFormat instanceof SymlinkTextInputFormat) {
            if (tableBucketInfo.isPresent()) {
                throw new PrestoException(NOT_SUPPORTED,
                        "Bucketed table in SymlinkTextInputFormat is not yet supported");

            // TODO: This should use an iterator like the HiveFileIterator
            ListenableFuture<?> lastResult = COMPLETED_FUTURE;
            for (Path targetPath : getTargetPathsFromSymlink(fs, path)) {
                // The input should be in TextInputFormat.
                TextInputFormat targetInputFormat = new TextInputFormat();
                // the splits must be generated using the file system for the target path
                // get the configuration for the target path -- it may be a different hdfs instance
                FileSystem targetFilesystem = hdfsEnvironment.getFileSystem(hdfsContext, targetPath);
                JobConf targetJob = toJobConf(targetFilesystem.getConf());
                FileInputFormat.setInputPaths(targetJob, targetPath);
                InputSplit[] targetSplits = targetInputFormat.getSplits(targetJob, 0);

                // NOTE(review): the trailing constructor argument(s) are missing in this copy.
                InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory(targetFilesystem,
                        partitionName, inputFormat, schema, partitionKeys, effectivePredicate,
                        partition.getColumnCoercions(), Optional.empty(), isForceLocalScheduling(session),
                lastResult = addSplitsToSource(targetSplits, splitFactory);
                // Stop enumerating further symlink targets once the loader has been stopped.
                if (stopped) {
                    return COMPLETED_FUTURE;
            return lastResult;

        Optional<BucketConversion> bucketConversion = Optional.empty();
        boolean bucketConversionRequiresWorkerParticipation = false;
        if (partition.getPartition().isPresent()) {
            // NOTE(review): the end of this call chain is missing in this copy.
            Optional<HiveBucketProperty> partitionBucketProperty = partition.getPartition().get().getStorage()
            if (tableBucketInfo.isPresent() && partitionBucketProperty.isPresent()) {
                int readBucketCount = tableBucketInfo.get().getReadBucketCount();
                int partitionBucketCount = partitionBucketProperty.get().getBucketCount();
                // Validation was done in HiveSplitManager#getPartitionMetadata.
                // Here, it's just trying to see if it needs the BucketConversion.
                if (readBucketCount != partitionBucketCount) {
                    // NOTE(review): the remaining BucketConversion arguments are missing in this copy.
                    bucketConversion = Optional.of(new BucketConversion(readBucketCount, partitionBucketCount,
                    // Only when more buckets are read than the partition physically has is the
                    // conversion passed to the split factory below.
                    if (readBucketCount > partitionBucketCount) {
                        bucketConversionRequiresWorkerParticipation = true;
        InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory(fs, partitionName, inputFormat, schema,
                partitionKeys, effectivePredicate, partition.getColumnCoercions(),
                bucketConversionRequiresWorkerParticipation ? bucketConversion : Optional.empty(),
                isForceLocalScheduling(session), s3SelectPushdownEnabled);

        // To support custom input formats, we want to call getSplits()
        // on the input format to obtain file splits.
        if (shouldUseFileSplitsFromInputFormat(inputFormat)) {
            if (tableBucketInfo.isPresent()) {
                throw new PrestoException(NOT_SUPPORTED,
                        "Presto cannot read bucketed partition in an input format with UseFileSplitsFromInputFormat annotation: "
                                + inputFormat.getClass().getSimpleName());
            JobConf jobConf = toJobConf(configuration);
            FileInputFormat.setInputPaths(jobConf, path);
            InputSplit[] splits = inputFormat.getSplits(jobConf, 0);

            return addSplitsToSource(splits, splitFactory);

        // Bucketed partitions are fully loaded immediately since all files must be loaded to determine the file to bucket mapping
        if (tableBucketInfo.isPresent()) {
            return hiveSplitSource
                    .addToQueue(getBucketedSplits(path, fs, splitFactory, tableBucketInfo.get(), bucketConversion));

        // S3 Select pushdown works at the granularity of individual S3 objects,
        // therefore we must not split files when it is enabled.
        boolean splittable = getHeaderCount(schema) == 0 && getFooterCount(schema) == 0 && !s3SelectPushdownEnabled;
        // Queue the lazy iterator; loadSplits() drains it on later calls.
        fileIterators.addLast(createInternalHiveSplitIterator(path, fs, splitFactory, splittable));
        return COMPLETED_FUTURE;

    /**
     * Converts each Hadoop input split to an internal Hive split and adds it to the split source.
     *
     * @param targetSplits input splits to convert; each element is cast to {@code FileSplit}
     * @param splitFactory factory that creates the internal split, possibly returning empty
     * @return the future of the last enqueue operation, or {@code COMPLETED_FUTURE} when nothing was
     *         added or the loader was stopped part-way through
     * @throws IOException declared for failures propagated from the split factory
     */
    private ListenableFuture<?> addSplitsToSource(InputSplit[] targetSplits, InternalHiveSplitFactory splitFactory)
            throws IOException {
        ListenableFuture<?> lastResult = COMPLETED_FUTURE;
        for (InputSplit inputSplit : targetSplits) {
            Optional<InternalHiveSplit> internalHiveSplit = splitFactory
                    .createInternalHiveSplit((FileSplit) inputSplit);
            // The factory may decline to create a split; only present results are queued.
            if (internalHiveSplit.isPresent()) {
                lastResult = hiveSplitSource.addToQueue(internalHiveSplit.get());
            // Abandon the remaining splits as soon as the loader has been stopped.
            if (stopped) {
                return COMPLETED_FUTURE;
        return lastResult;

    /**
     * Reports whether any class simple name in the inspected stream equals
     * {@code "UseFileSplitsFromInputFormat"}.
     *
     * <p>NOTE(review): the head of the stream expression is missing from this copy; presumably it
     * streams the annotation types declared on {@code inputFormat}'s class -- confirm against the
     * upstream source.
     */
    private static boolean shouldUseFileSplitsFromInputFormat(InputFormat<?, ?> inputFormat) {
                // Matching is done on the simple class name rather than on a class reference.
                .map(Class::getSimpleName).anyMatch(name -> name.equals("UseFileSplitsFromInputFormat"));

    /**
     * Builds an iterator of internal splits over the files found under {@code path}.
     *
     * @param path directory to list
     * @param fileSystem file system used for the listing
     * @param splitFactory factory that converts each file status to an optional internal split
     * @param splittable passed through to the factory for every file
     * @return an iterator over the splits the factory produced (empty results are filtered out)
     */
    private Iterator<InternalHiveSplit> createInternalHiveSplitIterator(Path path, FileSystem fileSystem,
            InternalHiveSplitFactory splitFactory, boolean splittable) {
        // Nested directories are recursed into or ignored depending on recursiveDirWalkerEnabled.
        // NOTE(review): the tail of this stream chain (unwrapping the Optional and obtaining the
        // iterator) is missing from this copy -- confirm against the upstream source.
        return Streams
                .stream(new HiveFileIterator(path, fileSystem, directoryLister, namenodeStats,
                        recursiveDirWalkerEnabled ? RECURSE : IGNORED))
                .map(status -> splitFactory.createInternalHiveSplit(status, splittable)).filter(Optional::isPresent)

    /**
     * Lists every file of a bucketed partition and creates one internal split per bucket that passes
     * the bucket filter.
     *
     * @param path partition directory
     * @param fileSystem file system used for the listing
     * @param splitFactory factory that creates the internal split for a file and read bucket number
     * @param bucketSplitInfo table/read bucket counts and the table-bucket filter
     * @param bucketConversion when present, supplies the partition's physical bucket count; otherwise
     *        the table bucket count is used
     * @return the splits for all eligible buckets, in bucket-number order
     * @throws PrestoException with {@code HIVE_INVALID_BUCKET_FILES} when a sub-directory is found or the
     *         number of files differs from the partition bucket count, and with {@code NOT_SUPPORTED}
     *         when a bucket maps to both eligible and ineligible table buckets
     */
    private List<InternalHiveSplit> getBucketedSplits(Path path, FileSystem fileSystem,
            InternalHiveSplitFactory splitFactory, BucketSplitInfo bucketSplitInfo,
            Optional<BucketConversion> bucketConversion) {
        int readBucketCount = bucketSplitInfo.getReadBucketCount();
        int tableBucketCount = bucketSplitInfo.getTableBucketCount();
        int partitionBucketCount = bucketConversion.isPresent() ? bucketConversion.get().getPartitionBucketCount()
                : tableBucketCount;

        // list all files in the partition
        ArrayList<LocatedFileStatus> files = new ArrayList<>(partitionBucketCount);
        try {
            // The FAIL policy makes the iterator throw when it encounters a nested directory.
            Iterators.addAll(files, new HiveFileIterator(path, fileSystem, directoryLister, namenodeStats, FAIL));
        } catch (NestedDirectoryNotAllowedException e) {
            // Fail here to be on the safe side. This seems to be the same as what Hive does
            // NOTE(review): the last format argument (the partition) is truncated in this copy.
            throw new PrestoException(HIVE_INVALID_BUCKET_FILES,
                    format("Hive table '%s' is corrupt. Found sub-directory in bucket directory for partition: %s",
                            new SchemaTableName(table.getDatabaseName(), table.getTableName()),

        // verify we found one file per bucket
        if (files.size() != partitionBucketCount) {
            throw new PrestoException(HIVE_INVALID_BUCKET_FILES, format(
                    "Hive table '%s' is corrupt. The number of files in the directory (%s) does not match the declared bucket count (%s) for partition: %s",
                    new SchemaTableName(table.getDatabaseName(), table.getTableName()), files.size(),
                    partitionBucketCount, splitFactory.getPartitionName()));

        // Sort FileStatus objects (instead of, e.g., fileStatus.getPath().toString). This matches org.apache.hadoop.hive.ql.metadata.Table.getSortedPaths
        // NOTE(review): the sort statement this comment describes is missing from this copy; the
        // index-based lookup below depends on the files being ordered -- confirm against upstream.

        // convert files to internal splits
        List<InternalHiveSplit> splitList = new ArrayList<>();
        for (int bucketNumber = 0; bucketNumber < Math.max(readBucketCount, partitionBucketCount); bucketNumber++) {
            // Physical bucket #. This determines the file name. It also determines the order of splits in the result.
            int partitionBucketNumber = bucketNumber % partitionBucketCount;
            // Logical bucket #. Each logical bucket corresponds to a "bucket" from engine's perspective.
            int readBucketNumber = bucketNumber % readBucketCount;

            boolean containsEligibleTableBucket = false;
            boolean containsIneligibleTableBucket = false;
            // Visit every table bucket congruent to this bucket number, stepping by
            // max(readBucketCount, partitionBucketCount), and record whether the filter keeps it.
            for (int tableBucketNumber = bucketNumber
                    % tableBucketCount; tableBucketNumber < tableBucketCount; tableBucketNumber += Math
                            .max(readBucketCount, partitionBucketCount)) {
                // table bucket number: this is used for evaluating "$bucket" filters.
                if (bucketSplitInfo.isTableBucketEnabled(tableBucketNumber)) {
                    containsEligibleTableBucket = true;
                } else {
                    containsIneligibleTableBucket = true;

            // A bucket that maps to both kept and filtered table buckets cannot be read selectively.
            if (containsEligibleTableBucket && containsIneligibleTableBucket) {
                throw new PrestoException(NOT_SUPPORTED,
                        "The bucket filter cannot be satisfied. There are restrictions on the bucket filter when all the following is true: "
                                + "1. a table has a different buckets count as at least one of its partitions that is read in this query; "
                                + "2. the table has a different but compatible bucket number with another table in the query; "
                                + "3. some buckets of the table is filtered out from the query, most likely using a filter on \"$bucket\". "
                                + "(table name: " + table.getTableName() + ", table bucket count: "
                                + tableBucketCount + ", " + "partition bucket count: " + partitionBucketCount
                                + ", effective reading bucket count: " + readBucketCount + ")");
            if (containsEligibleTableBucket) {
                // The file is selected by physical bucket index; the split is tagged with the read bucket number.
                LocatedFileStatus file = files.get(partitionBucketNumber);
                splitFactory.createInternalHiveSplit(file, readBucketNumber).ifPresent(splitList::add);
        return splitList;

    /**
     * Collects the target paths referenced by the symlink files in {@code symlinkDir}.
     *
     * @param fileSystem file system used to list (and presumably read) the symlink files
     * @param symlinkDir directory containing the symlink files; hidden files are filtered out
     * @return the collected target paths
     * @throws PrestoException with {@code HIVE_BAD_DATA} when an {@code IOException} occurs
     */
    private static List<Path> getTargetPathsFromSymlink(FileSystem fileSystem, Path symlinkDir) {
        try {
            FileStatus[] symlinks = fileSystem.listStatus(symlinkDir, HIDDEN_FILES_PATH_FILTER);
            List<Path> targets = new ArrayList<>();

            for (FileStatus symlink : symlinks) {
                // Each symlink file is read as UTF-8 text inside try-with-resources.
                // NOTE(review): the stream argument of InputStreamReader and the loop body that fills
                // `targets` are missing from this copy -- presumably each line is one target path;
                // confirm against the upstream source.
                try (BufferedReader reader = new BufferedReader(
                        new InputStreamReader(, StandardCharsets.UTF_8))) {
            return targets;
        } catch (IOException e) {
            throw new PrestoException(HIVE_BAD_DATA, "Error parsing symlinks from: " + symlinkDir, e);

    /**
     * Pairs the table's partition columns with the partition's values.
     *
     * @param table table supplying the partition column names and types
     * @param partition partition supplying the values; when absent an empty list is returned
     * @return one {@code HivePartitionKey} per partition column, in column order
     * @throws PrestoException with {@code HIVE_INVALID_METADATA} when the key and value counts differ,
     *         {@code NOT_SUPPORTED} for an unsupported key type, or
     *         {@code HIVE_INVALID_PARTITION_VALUE} for a null value (the latter two assuming
     *         {@code checkCondition} throws with the given error code -- TODO confirm)
     */
    private static List<HivePartitionKey> getPartitionKeys(Table table, Optional<Partition> partition) {
        if (!partition.isPresent()) {
            return ImmutableList.of();
        ImmutableList.Builder<HivePartitionKey> partitionKeys = ImmutableList.builder();
        List<Column> keys = table.getPartitionColumns();
        List<String> values = partition.get().getValues();
        // Keys and values are matched by position, so the two lists must be the same length.
        checkCondition(keys.size() == values.size(), HIVE_INVALID_METADATA,
                "Expected %s partition key values, but got %s", keys.size(), values.size());
        for (int i = 0; i < keys.size(); i++) {
            String name = keys.get(i).getName();
            HiveType hiveType = keys.get(i).getType();
            if (!hiveType.isSupportedType()) {
                throw new PrestoException(NOT_SUPPORTED,
                        format("Unsupported Hive type %s found in partition keys of table %s.%s", hiveType,
                                table.getDatabaseName(), table.getTableName()));
            String value = values.get(i);
            checkCondition(value != null, HIVE_INVALID_PARTITION_VALUE,
                    "partition key value cannot be null for field: %s", name);
            partitionKeys.add(new HivePartitionKey(name, value));
        // NOTE(review): the final return of the built list is missing from this copy.

    /**
     * Returns the Hive schema properties to use for a partition.
     *
     * @param table the table being read
     * @param partition the partition, if any
     * @return {@code getHiveSchema(table)} when no partition is present, otherwise
     *         {@code getHiveSchema(partition, table)}
     */
    private static Properties getPartitionSchema(Table table, Optional<Partition> partition) {
        if (!partition.isPresent()) {
            return getHiveSchema(table);
        return getHiveSchema(partition.get(), table);

    /**
     * Value holder describing how a bucketed table is read: its bucket columns, the table and read
     * bucket counts, and a predicate deciding which table buckets are enabled. Instances are created
     * through {@code createBucketSplitInfo}.
     */
    public static class BucketSplitInfo {
        private final List<HiveColumnHandle> bucketColumns;
        // Bucket count taken from HiveBucketHandle.getTableBucketCount().
        private final int tableBucketCount;
        // Bucket count taken from HiveBucketHandle.getReadBucketCount().
        private final int readBucketCount;
        // Tested with a table bucket number; true means the bucket is kept.
        private final IntPredicate bucketFilter;

        /**
         * Creates the bucket split info for a table, if the table has a bucket handle.
         *
         * @param bucketHandle bucket handle of the table; must not be null
         * @param bucketFilter optional filter listing the buckets to keep; must not be null and may only
         *        be present when {@code bucketHandle} is present
         * @return empty when {@code bucketHandle} is absent; otherwise an info whose predicate accepts
         *         the filter's buckets-to-keep, or every bucket when no filter is given
         */
        public static Optional<BucketSplitInfo> createBucketSplitInfo(Optional<HiveBucketHandle> bucketHandle,
                Optional<HiveBucketFilter> bucketFilter) {
            requireNonNull(bucketHandle, "bucketHandle is null");
            // NOTE(review): the message says "buckets" although the parameter is named bucketFilter.
            requireNonNull(bucketFilter, "buckets is null");

            if (!bucketHandle.isPresent()) {
                // A filter without a bucket handle is rejected as an invalid argument combination.
                checkArgument(!bucketFilter.isPresent(), "bucketHandle must be present if bucketFilter is present");
                return Optional.empty();

            int tableBucketCount = bucketHandle.get().getTableBucketCount();
            int readBucketCount = bucketHandle.get().getReadBucketCount();
            List<HiveColumnHandle> bucketColumns = bucketHandle.get().getColumns();
            // With a filter, membership in getBucketsToKeep() decides; without one, all buckets pass.
            IntPredicate predicate = bucketFilter.<IntPredicate>map(filter -> filter.getBucketsToKeep()::contains)
                    .orElse(bucket -> true);
            return Optional.of(new BucketSplitInfo(bucketColumns, tableBucketCount, readBucketCount, predicate));

        /**
         * Private constructor; use {@code createBucketSplitInfo}.
         *
         * @param bucketColumns bucket columns; must not be null, copied into an immutable list
         * @param tableBucketCount bucket count of the table
         * @param readBucketCount bucket count used for reading
         * @param bucketFilter predicate over table bucket numbers; must not be null
         */
        private BucketSplitInfo(List<HiveColumnHandle> bucketColumns, int tableBucketCount, int readBucketCount,
                IntPredicate bucketFilter) {
            this.bucketColumns = ImmutableList.copyOf(requireNonNull(bucketColumns, "bucketColumns is null"));
            this.tableBucketCount = tableBucketCount;
            this.readBucketCount = readBucketCount;
            this.bucketFilter = requireNonNull(bucketFilter, "bucketFilter is null");

        /** Returns the bucket columns held in the {@code bucketColumns} field. */
        public List<HiveColumnHandle> getBucketColumns() {
            return bucketColumns;

        /** Returns the table bucket count held in the {@code tableBucketCount} field. */
        public int getTableBucketCount() {
            return tableBucketCount;

        /** Returns the read bucket count held in the {@code readBucketCount} field. */
        public int getReadBucketCount() {
            return readBucketCount;

        /**
         * Evaluates whether the provided table bucket number passes the bucket predicate.
         * A bucket predicate can be present in two cases:
         * <ul>
         * <li>Filter on "$bucket" column. e.g. {@code "$bucket" between 0 and 100}
         * <li>Single-value equality filter on all bucket columns. e.g. for a table with two bucketing columns,
         * {@code bucketCol1 = 'a' AND bucketCol2 = 123}
         * </ul>
         */
        public boolean isTableBucketEnabled(int tableBucketNumber) {
            // Delegates to the IntPredicate stored in the bucketFilter field.
            return bucketFilter.test(tableBucketNumber);