com.facebook.presto.hive.HiveSplitSourceProvider.java Source code

Introduction

Here is the source code for com.facebook.presto.hive.HiveSplitSourceProvider.java
Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.hive;

import com.facebook.presto.hive.util.AsyncWalker;
import com.facebook.presto.hive.util.BoundedExecutor;
import com.facebook.presto.hive.util.FileStatusCallback;
import com.facebook.presto.hive.util.SetThreadName;
import com.facebook.presto.hive.util.SuspendingExecutor;
import com.facebook.presto.spi.ConnectorSession;
import com.facebook.presto.spi.ConnectorSplit;
import com.facebook.presto.spi.ConnectorSplitSource;
import com.facebook.presto.spi.HostAddress;
import com.facebook.presto.spi.PrestoException;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import io.airlift.units.DataSize;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;

import javax.annotation.concurrent.GuardedBy;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Executor;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;

import static com.facebook.presto.hadoop.HadoopFileStatus.isFile;
import static com.facebook.presto.hive.HiveBucketing.HiveBucket;
import static com.facebook.presto.hive.HiveType.getSupportedHiveType;
import static com.facebook.presto.hive.HiveUtil.convertNativeHiveType;
import static com.facebook.presto.hive.HiveUtil.getInputFormat;
import static com.facebook.presto.hive.HiveUtil.isSplittable;
import static com.facebook.presto.hive.UnpartitionedPartition.isUnpartitioned;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;

class HiveSplitSourceProvider {
    private static final ConnectorSplit FINISHED_MARKER = new ConnectorSplit() {
        @Override
        public boolean isRemotelyAccessible() {
            throw new UnsupportedOperationException();
        }

        @Override
        public List<HostAddress> getAddresses() {
            throw new UnsupportedOperationException();
        }

        @Override
        public Object getInfo() {
            throw new UnsupportedOperationException();
        }
    };

    private final String connectorId;
    private final Table table;
    private final Iterable<String> partitionNames;
    private final Iterable<Partition> partitions;
    private final Optional<HiveBucket> bucket;
    private final int maxOutstandingSplits;
    private final int maxThreads;
    private final HdfsEnvironment hdfsEnvironment;
    private final NamenodeStats namenodeStats;
    private final DirectoryLister directoryLister;
    private final Executor executor;
    private final ClassLoader classLoader;
    private final DataSize maxSplitSize;
    private final int maxPartitionBatchSize;
    private final DataSize maxInitialSplitSize;
    private long remainingInitialSplits;
    private final ConnectorSession session;
    private final boolean recursiveDirWalkerEnabled;

    HiveSplitSourceProvider(String connectorId, Table table, Iterable<String> partitionNames,
            Iterable<Partition> partitions, Optional<HiveBucket> bucket, DataSize maxSplitSize,
            int maxOutstandingSplits, int maxThreads, HdfsEnvironment hdfsEnvironment, NamenodeStats namenodeStats,
            DirectoryLister directoryLister, Executor executor, int maxPartitionBatchSize, ConnectorSession session,
            DataSize maxInitialSplitSize, int maxInitialSplits, boolean recursiveDirWalkerEnabled) {
        this.connectorId = connectorId;
        this.table = table;
        this.partitionNames = partitionNames;
        this.partitions = partitions;
        this.bucket = bucket;
        this.maxSplitSize = maxSplitSize;
        this.maxPartitionBatchSize = maxPartitionBatchSize;
        this.maxOutstandingSplits = maxOutstandingSplits;
        this.maxThreads = maxThreads;
        this.hdfsEnvironment = hdfsEnvironment;
        this.namenodeStats = namenodeStats;
        this.directoryLister = directoryLister;
        this.executor = executor;
        this.session = session;
        this.classLoader = Thread.currentThread().getContextClassLoader();
        this.maxInitialSplitSize = maxInitialSplitSize;
        this.remainingInitialSplits = maxInitialSplits;
        this.recursiveDirWalkerEnabled = recursiveDirWalkerEnabled;
    }

    public ConnectorSplitSource get() {
        // Each iterator has its own bounded executor and can be independently suspended
        final SuspendingExecutor suspendingExecutor = new SuspendingExecutor(
                new BoundedExecutor(executor, maxThreads));
        final HiveSplitSource splitSource = new HiveSplitSource(connectorId, maxOutstandingSplits,
                suspendingExecutor);

        FutureTask<?> producer = new FutureTask<>(new Runnable() {
            @Override
            public void run() {
                try (SetThreadName ignored = new SetThreadName("HiveSplitProducer")) {
                    loadPartitionSplits(splitSource, suspendingExecutor, session);
                }
            }
        }, null);

        executor.execute(producer);
        splitSource.setProducerFuture(producer);

        return splitSource;
    }

    private void loadPartitionSplits(final HiveSplitSource hiveSplitSource, SuspendingExecutor suspendingExecutor,
            final ConnectorSession session) {
        final Semaphore semaphore = new Semaphore(maxPartitionBatchSize);
        try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(classLoader)) {
            ImmutableList.Builder<ListenableFuture<Void>> futureBuilder = ImmutableList.builder();

            Iterator<String> nameIterator = partitionNames.iterator();
            for (Partition partition : partitions) {
                checkState(nameIterator.hasNext(), "different number of partitions and partition names!");
                final String partitionName = nameIterator.next();
                final Properties schema = getPartitionSchema(table, partition);
                final List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition);

                Path path = new Path(getPartitionLocation(table, partition));
                final Configuration configuration = hdfsEnvironment.getConfiguration(path);
                final InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);

                FileSystem fs = path.getFileSystem(configuration);

                if (inputFormat instanceof SymlinkTextInputFormat) {
                    JobConf jobConf = new JobConf(configuration);
                    FileInputFormat.setInputPaths(jobConf, path);
                    InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
                    for (InputSplit rawSplit : splits) {
                        FileSplit split = ((SymlinkTextInputFormat.SymlinkTextInputSplit) rawSplit)
                                .getTargetSplit();

                        // get the filesystem for the target path -- it may be a different hdfs instance
                        FileSystem targetFilesystem = split.getPath().getFileSystem(configuration);
                        FileStatus fileStatus = targetFilesystem.getFileStatus(split.getPath());
                        hiveSplitSource.addToQueue(createHiveSplits(partitionName, fileStatus,
                                targetFilesystem.getFileBlockLocations(fileStatus, split.getStart(),
                                        split.getLength()),
                                split.getStart(), split.getLength(), schema, partitionKeys, false, session));
                    }
                    continue;
                }

                // TODO: this is currently serial across all partitions and should be done in suspendingExecutor
                if (bucket.isPresent()) {
                    Optional<FileStatus> bucketFile = getBucketFile(bucket.get(), fs, path);
                    if (bucketFile.isPresent()) {
                        FileStatus file = bucketFile.get();
                        BlockLocation[] blockLocations = fs.getFileBlockLocations(file, 0, file.getLen());
                        boolean splittable = isSplittable(inputFormat, fs, file.getPath());

                        hiveSplitSource.addToQueue(createHiveSplits(partitionName, file, blockLocations, 0,
                                file.getLen(), schema, partitionKeys, splittable, session));
                        continue;
                    }
                }

                // Acquire semaphore so that we only have a fixed number of outstanding partitions being processed asynchronously
                // NOTE: there must not be any calls that throw in the space between acquiring the semaphore and setting the Future
                // callback to release it. Otherwise, we will need a try-finally block around this section.
                try {
                    semaphore.acquire();
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    return;
                }

                ListenableFuture<Void> partitionFuture = createAsyncWalker(fs, suspendingExecutor).beginWalk(path,
                        new FileStatusCallback() {
                            @Override
                            public void process(FileStatus file, BlockLocation[] blockLocations) {
                                try {
                                    boolean splittable = isSplittable(inputFormat,
                                            file.getPath().getFileSystem(configuration), file.getPath());

                                    hiveSplitSource.addToQueue(createHiveSplits(partitionName, file, blockLocations,
                                            0, file.getLen(), schema, partitionKeys, splittable, session));
                                } catch (IOException e) {
                                    hiveSplitSource.fail(e);
                                }
                            }
                        });

                // release the semaphore when the partition finishes
                Futures.addCallback(partitionFuture, new FutureCallback<Void>() {
                    @Override
                    public void onSuccess(Void result) {
                        semaphore.release();
                    }

                    @Override
                    public void onFailure(Throwable t) {
                        semaphore.release();
                    }
                });

                futureBuilder.add(partitionFuture);
            }

            // when all partitions finish, mark the queue as finished
            Futures.addCallback(Futures.allAsList(futureBuilder.build()), new FutureCallback<List<Void>>() {
                @Override
                public void onSuccess(List<Void> result) {
                    hiveSplitSource.finished();
                }

                @Override
                public void onFailure(Throwable t) {
                    hiveSplitSource.fail(t);
                }
            });
        } catch (Throwable e) {
            hiveSplitSource.fail(e);
            Throwables.propagateIfInstanceOf(e, Error.class);
        }
    }

    private AsyncWalker createAsyncWalker(FileSystem fs, SuspendingExecutor suspendingExecutor) {
        return new AsyncWalker(fs, suspendingExecutor, directoryLister, namenodeStats, recursiveDirWalkerEnabled);
    }

    private static Optional<FileStatus> getBucketFile(HiveBucket bucket, FileSystem fs, Path path) {
        FileStatus[] statuses = listStatus(fs, path);

        if (statuses.length != bucket.getBucketCount()) {
            return Optional.absent();
        }

        Map<String, FileStatus> map = new HashMap<>();
        List<String> paths = new ArrayList<>();
        for (FileStatus status : statuses) {
            if (!isFile(status)) {
                return Optional.absent();
            }
            String pathString = status.getPath().toString();
            map.put(pathString, status);
            paths.add(pathString);
        }

        // Hive sorts the paths as strings lexicographically
        Collections.sort(paths);

        String pathString = paths.get(bucket.getBucketNumber());
        return Optional.of(map.get(pathString));
    }

    private static FileStatus[] listStatus(FileSystem fs, Path path) {
        try {
            return fs.listStatus(path);
        } catch (IOException e) {
            throw Throwables.propagate(e);
        }
    }

    private List<HiveSplit> createHiveSplits(String partitionName, FileStatus file, BlockLocation[] blockLocations,
            long start, long length, Properties schema, List<HivePartitionKey> partitionKeys, boolean splittable,
            ConnectorSession session) throws IOException {
        ImmutableList.Builder<HiveSplit> builder = ImmutableList.builder();
        if (splittable) {
            for (BlockLocation blockLocation : blockLocations) {
                // get the addresses for the block
                List<HostAddress> addresses = toHostAddress(blockLocation.getHosts());

                long maxBytes = maxSplitSize.toBytes();

                if (remainingInitialSplits > 0) {
                    maxBytes = maxInitialSplitSize.toBytes();
                }

                // divide the block into uniform chunks that are smaller than the max split size
                int chunks = Math.max(1, (int) (blockLocation.getLength() / maxBytes));
                // when block does not divide evenly into chunks, make the chunk size slightly bigger than necessary
                long targetChunkSize = (long) Math.ceil(blockLocation.getLength() * 1.0 / chunks);

                long chunkOffset = 0;
                while (chunkOffset < blockLocation.getLength()) {
                    // adjust the actual chunk size to account for the overrun when chunks are slightly bigger than necessary (see above)
                    long chunkLength = Math.min(targetChunkSize, blockLocation.getLength() - chunkOffset);

                    builder.add(new HiveSplit(connectorId, table.getDbName(), table.getTableName(), partitionName,
                            file.getPath().toString(), blockLocation.getOffset() + chunkOffset, chunkLength, schema,
                            partitionKeys, addresses, session));

                    chunkOffset += chunkLength;
                    remainingInitialSplits--;
                }
                checkState(chunkOffset == blockLocation.getLength(), "Error splitting blocks");
            }
        } else {
            // not splittable, use the hosts from the first block if it exists
            List<HostAddress> addresses = ImmutableList.of();
            if (blockLocations.length > 0) {
                addresses = toHostAddress(blockLocations[0].getHosts());
            }

            builder.add(new HiveSplit(connectorId, table.getDbName(), table.getTableName(), partitionName,
                    file.getPath().toString(), start, length, schema, partitionKeys, addresses, session));
        }
        return builder.build();
    }

    private static List<HostAddress> toHostAddress(String[] hosts) {
        ImmutableList.Builder<HostAddress> builder = ImmutableList.builder();
        for (String host : hosts) {
            builder.add(HostAddress.fromString(host));
        }
        return builder.build();
    }

    @VisibleForTesting
    static class HiveSplitSource implements ConnectorSplitSource {
        private final String connectorId;
        private final BlockingQueue<ConnectorSplit> queue = new LinkedBlockingQueue<>();
        private final AtomicInteger outstandingSplitCount = new AtomicInteger();
        private final AtomicReference<Throwable> throwable = new AtomicReference<>();
        private final int maxOutstandingSplits;
        private final SuspendingExecutor suspendingExecutor;
        private volatile boolean closed;

        @GuardedBy("this")
        private Future<?> producerFuture;

        @VisibleForTesting
        HiveSplitSource(String connectorId, int maxOutstandingSplits, SuspendingExecutor suspendingExecutor) {
            this.connectorId = connectorId;
            this.maxOutstandingSplits = maxOutstandingSplits;
            this.suspendingExecutor = suspendingExecutor;
        }

        @VisibleForTesting
        int getOutstandingSplitCount() {
            return outstandingSplitCount.get();
        }

        void addToQueue(Iterable<? extends ConnectorSplit> splits) {
            for (ConnectorSplit split : splits) {
                addToQueue(split);
            }
        }

        @VisibleForTesting
        void addToQueue(ConnectorSplit split) {
            if (throwable.get() == null) {
                queue.add(split);
                if (outstandingSplitCount.incrementAndGet() >= maxOutstandingSplits) {
                    suspendingExecutor.suspend();
                }
            }
        }

        @VisibleForTesting
        void finished() {
            if (throwable.get() == null) {
                queue.add(FINISHED_MARKER);
            }
        }

        @VisibleForTesting
        void fail(Throwable e) {
            // only record the first error message
            if (throwable.compareAndSet(null, e)) {
                // add the finish marker
                queue.add(FINISHED_MARKER);

                // no need to process any more jobs
                suspendingExecutor.suspend();
            }
        }

        @Override
        public String getDataSourceName() {
            return connectorId;
        }

        @Override
        public List<ConnectorSplit> getNextBatch(int maxSize) throws InterruptedException {
            checkState(!closed, "Provider is already closed");

            // wait for at least one split and then take as may extra splits as possible
            // if an error has been registered, the take will succeed immediately because
            // will be at least one finished marker in the queue
            List<ConnectorSplit> splits = new ArrayList<>(maxSize);
            splits.add(queue.take());
            queue.drainTo(splits, maxSize - 1);

            // check if we got the finished marker in our list
            int finishedIndex = splits.indexOf(FINISHED_MARKER);
            if (finishedIndex >= 0) {
                // add the finish marker back to the queue so future callers will not block indefinitely
                queue.add(FINISHED_MARKER);
                // drop all splits after the finish marker (this shouldn't happen in a normal exit, but be safe)
                splits = splits.subList(0, finishedIndex);
            }

            // Before returning, check if there is a registered failure.
            // If so, we want to throw the error, instead of returning because the scheduler can block
            // while scheduling splits and wait for work to finish before continuing.  In this case,
            // we want to end the query as soon as possible and abort the work
            if (throwable.get() != null) {
                throw propagatePrestoException(throwable.get());
            }

            // decrement the outstanding split count by the number of splits we took
            if (outstandingSplitCount.addAndGet(-splits.size()) < maxOutstandingSplits) {
                // we are below the low water mark (and there isn't a failure) so resume scanning hdfs
                suspendingExecutor.resume();
            }

            return splits;
        }

        @Override
        public boolean isFinished() {
            // the finished marker must be checked before checking the throwable
            // to avoid a race with the fail method
            boolean isFinished = queue.peek() == FINISHED_MARKER;
            if (throwable.get() != null) {
                throw propagatePrestoException(throwable.get());
            }
            return isFinished;
        }

        @Override
        public void close() {
            queue.add(FINISHED_MARKER);
            suspendingExecutor.suspend();

            synchronized (this) {
                closed = true;

                if (producerFuture != null) {
                    producerFuture.cancel(true);
                }
            }
        }

        public synchronized void setProducerFuture(Future<?> future) {
            producerFuture = future;

            // someone may have called close before calling this method
            if (closed) {
                producerFuture.cancel(true);
            }
        }

        private RuntimeException propagatePrestoException(Throwable throwable) {
            if (throwable instanceof PrestoException) {
                throw (PrestoException) throwable;
            }
            if (throwable instanceof FileNotFoundException) {
                throw new PrestoException(HiveErrorCode.HIVE_FILE_NOT_FOUND.toErrorCode(), throwable);
            }
            throw new PrestoException(HiveErrorCode.HIVE_UNKNOWN_ERROR.toErrorCode(), throwable);
        }
    }

    private static List<HivePartitionKey> getPartitionKeys(Table table, Partition partition) {
        if (isUnpartitioned(partition)) {
            return ImmutableList.of();
        }
        ImmutableList.Builder<HivePartitionKey> partitionKeys = ImmutableList.builder();
        List<FieldSchema> keys = table.getPartitionKeys();
        List<String> values = partition.getValues();
        checkArgument(keys.size() == values.size(), "Expected %s partition key values, but got %s", keys.size(),
                values.size());
        for (int i = 0; i < keys.size(); i++) {
            String name = keys.get(i).getName();
            PrimitiveObjectInspector.PrimitiveCategory primitiveCategory = convertNativeHiveType(
                    keys.get(i).getType());
            HiveType hiveType = getSupportedHiveType(primitiveCategory);
            String value = values.get(i);
            checkNotNull(value, "partition key value cannot be null for field: %s", name);
            partitionKeys.add(new HivePartitionKey(name, hiveType, value));
        }
        return partitionKeys.build();
    }

    private static Properties getPartitionSchema(Table table, Partition partition) {
        if (isUnpartitioned(partition)) {
            return MetaStoreUtils.getTableMetadata(table);
        }
        return MetaStoreUtils.getSchema(partition, table);
    }

    private static String getPartitionLocation(Table table, Partition partition) {
        if (isUnpartitioned(partition)) {
            return table.getSd().getLocation();
        }
        return partition.getSd().getLocation();
    }
}