Java tutorial: walking through Apache Hive's SimpleFetchOptimizer
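This tutorial walks through Apache Hive's SimpleFetchOptimizer (package org.apache.hadoop.hive.ql.optimizer), which tries to convert a simple fetch query into a single FetchTask that reads rows directly from the table or partition location, instead of writing through a FileSink to a temporary location and fetching from there. The full source file is reproduced below.

Two configuration values drive the conversion and are read in optimize() and checkThreshold(): hive.fetch.task.conversion ("none", "minimal" or "more") and hive.fetch.task.conversion.threshold. As a minimal sketch of setting them programmatically (the class FetchConversionSettings and its main() harness are illustrative, not part of Hive):

import org.apache.hadoop.hive.conf.HiveConf;

public class FetchConversionSettings {
  public static void main(String[] args) {
    HiveConf conf = new HiveConf();
    // hive.fetch.task.conversion, read via HiveConf.ConfVars.HIVEFETCHTASKCONVERSION;
    // "more" enables the aggressive path ("more".equals(mode) in optimize() below).
    conf.setVar(HiveConf.ConfVars.HIVEFETCHTASKCONVERSION, "more");
    // hive.fetch.task.conversion.threshold, read via HIVEFETCHTASKCONVERSIONTHRESHOLD;
    // checkThreshold() treats a negative value as "skip the data-size check".
    conf.setLongVar(HiveConf.ConfVars.HIVEFETCHTASKCONVERSIONTHRESHOLD, 1024L * 1024L * 1024L);
    System.out.println(HiveConf.getVar(conf, HiveConf.ConfVars.HIVEFETCHTASKCONVERSION));
  }
}

The same properties can of course be set in hive-site.xml or with SET in a Hive session; the sketch only shows which knobs the code below consults.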
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.optimizer;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicLong;

import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ThreadFactoryBuilder;

import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.ql.stats.StatsUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.CommonJoinOperator;
import org.apache.hadoop.hive.ql.exec.FetchTask;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.FilterOperator;
import org.apache.hadoop.hive.ql.exec.LimitOperator;
import org.apache.hadoop.hive.ql.exec.ListSinkOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorFactory;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.ScriptOperator;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat;
import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.ql.metadata.InputEstimator;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.parse.SplitSample;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.ListSinkDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.SelectDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToBinary;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToChar;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToDate;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToDecimal;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUnixTimeStamp;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUtcTimestamp;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToVarchar;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;

/**
 * Tries to convert simple fetch query to single fetch task, which fetches rows directly
 * from location of table/partition.
 */
public class SimpleFetchOptimizer extends Transform {

  private final Logger LOG = LoggerFactory.getLogger(SimpleFetchOptimizer.class.getName());

  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {
    Map<String, TableScanOperator> topOps = pctx.getTopOps();
    if (pctx.getQueryProperties().isQuery() && !pctx.getQueryProperties().isAnalyzeCommand()
        && topOps.size() == 1) {
      // no join, no groupby, no distinct, no lateral view, no subq,
      // no CTAS or insert, not analyze command, and single sourced.
      String alias = (String) pctx.getTopOps().keySet().toArray()[0];
      TableScanOperator topOp = pctx.getTopOps().values().iterator().next();
      try {
        FetchTask fetchTask = optimize(pctx, alias, topOp);
        if (fetchTask != null) {
          pctx.setFetchTask(fetchTask);
        }
      } catch (Exception e) {
        // Has to use full name to make sure it does not conflict with
        // org.apache.commons.lang.StringUtils
        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
        if (e instanceof SemanticException) {
          throw (SemanticException) e;
        }
        throw new SemanticException(e.getMessage(), e);
      }
    }
    return pctx;
  }

  // returns non-null FetchTask instance when succeeded
  @SuppressWarnings("unchecked")
  private FetchTask optimize(ParseContext pctx, String alias, TableScanOperator source)
      throws Exception {
    String mode = HiveConf.getVar(pctx.getConf(), HiveConf.ConfVars.HIVEFETCHTASKCONVERSION);
    boolean aggressive = "more".equals(mode);
    final int limit = pctx.getQueryProperties().getOuterQueryLimit();
    // limit = 0 means that we do not need any task.
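    // Conversion outline: checkTree()/checkOperators() verify that the plan is a simple
    // TS -> (SEL | FIL | LIM)* -> FS chain (isConvertible() handles the aggressive "more"
    // mode), checkThreshold() bounds how much data a direct fetch would read, and
    // convertToWork()/completed() build the FetchWork and swap the FileSinkOperator
    // for a ListSinkOperator.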
    if (limit == 0) {
      return null;
    }

    FetchData fetch = checkTree(aggressive, pctx, alias, source);
    if (fetch != null && checkThreshold(fetch, limit, pctx)) {
      FetchWork fetchWork = fetch.convertToWork();
      FetchTask fetchTask = (FetchTask) TaskFactory.get(fetchWork, pctx.getConf());
      fetchWork.setSink(fetch.completed(pctx, fetchWork));
      fetchWork.setSource(source);
      fetchWork.setLimit(limit);
      return fetchTask;
    }
    return null;
  }

  private boolean checkThreshold(FetchData data, int limit, ParseContext pctx) throws Exception {
    if (limit > 0) {
      if (data.hasOnlyPruningFilter()) {
        /* partitioned table + query has only pruning filters */
        return true;
      } else if (data.isPartitioned() == false && data.isFiltered() == false) {
        /* unpartitioned table + no filters */
        return true;
      }
      /* fall through */
    }
    long threshold = HiveConf.getLongVar(pctx.getConf(),
        HiveConf.ConfVars.HIVEFETCHTASKCONVERSIONTHRESHOLD);
    if (threshold < 0) {
      return true;
    }
    Operator child = data.scanOp.getChildOperators().get(0);
    if (child instanceof SelectOperator) {
      // select *, constant and casts can be allowed without a threshold check
      if (checkExpressions((SelectOperator) child)) {
        return true;
      }
    }
    return data.isDataLengthWithInThreshold(pctx, threshold);
  }

  // all we can handle is LimitOperator, FilterOperator, SelectOperator and final FS
  //
  // for non-aggressive mode (minimal)
  // 1. sampling is not allowed
  // 2. for partitioned table, all filters should be targeted to partition column
  // 3. SelectOperator should use only simple cast/column access
  private FetchData checkTree(boolean aggressive, ParseContext pctx, String alias,
      TableScanOperator ts) throws HiveException {
    SplitSample splitSample = pctx.getNameToSplitSample().get(alias);
    if (!aggressive && splitSample != null) {
      return null;
    }
    if (!aggressive && ts.getConf().getTableSample() != null) {
      return null;
    }
    Table table = ts.getConf().getTableMetadata();
    if (table == null) {
      return null;
    }
    ReadEntity parent = PlanUtils.getParentViewInfo(alias, pctx.getViewAliasToInput());
    if (!table.isPartitioned()) {
      FetchData fetch = new FetchData(ts, parent, table, splitSample);
      return checkOperators(fetch, aggressive, false);
    }

    boolean bypassFilter = false;
    if (HiveConf.getBoolVar(pctx.getConf(), HiveConf.ConfVars.HIVEOPTPPD)) {
      ExprNodeDesc pruner = pctx.getOpToPartPruner().get(ts);
      if (PartitionPruner.onlyContainsPartnCols(table, pruner)) {
        bypassFilter = !pctx.getPrunedPartitions(alias, ts).hasUnknownPartitions();
      }
    }
    if (!aggressive && !bypassFilter) {
      return null;
    }
    PrunedPartitionList partitions = pctx.getPrunedPartitions(alias, ts);
    FetchData fetch = new FetchData(ts, parent, table, partitions, splitSample, bypassFilter);
    return checkOperators(fetch, aggressive, bypassFilter);
  }

  private FetchData checkOperators(FetchData fetch, boolean aggressive, boolean bypassFilter) {
    if (aggressive) {
      return isConvertible(fetch) ?
          fetch : null;
    }
    return checkOperators(fetch, fetch.scanOp, bypassFilter);
  }

  private FetchData checkOperators(FetchData fetch, TableScanOperator ts, boolean bypassFilter) {
    if (ts.getChildOperators().size() != 1) {
      return null;
    }
    Operator<?> op = ts.getChildOperators().get(0);
    for (;; op = op.getChildOperators().get(0)) {
      if (op instanceof SelectOperator) {
        if (!checkExpressions((SelectOperator) op)) {
          return null;
        }
        continue;
      }
      if (!(op instanceof LimitOperator || (op instanceof FilterOperator && bypassFilter))) {
        break;
      }
      if (op.getChildOperators() == null || op.getChildOperators().size() != 1) {
        return null;
      }
      if (op instanceof FilterOperator) {
        fetch.setFiltered(true);
      }
    }
    if (op instanceof FileSinkOperator) {
      fetch.fileSink = op;
      return fetch;
    }
    return null;
  }

  private boolean checkExpressions(SelectOperator op) {
    SelectDesc desc = op.getConf();
    if (desc.isSelectStar() || desc.isSelStarNoCompute()) {
      return true;
    }
    for (ExprNodeDesc expr : desc.getColList()) {
      if (!checkExpression(expr)) {
        return false;
      }
    }
    return true;
  }

  private boolean checkExpression(ExprNodeDesc expr) {
    if (expr instanceof ExprNodeConstantDesc || expr instanceof ExprNodeColumnDesc) {
      return true;
    }
    if (expr instanceof ExprNodeGenericFuncDesc) {
      GenericUDF udf = ((ExprNodeGenericFuncDesc) expr).getGenericUDF();
      if (udf instanceof GenericUDFToBinary || udf instanceof GenericUDFToChar
          || udf instanceof GenericUDFToDate || udf instanceof GenericUDFToDecimal
          || udf instanceof GenericUDFToUnixTimeStamp || udf instanceof GenericUDFToUtcTimestamp
          || udf instanceof GenericUDFToVarchar) {
        return expr.getChildren().size() == 1 && checkExpression(expr.getChildren().get(0));
      }
    }
    return false;
  }

  private boolean isConvertible(FetchData fetch) {
    return isConvertible(fetch, fetch.scanOp, new HashSet<Operator<?>>());
  }

  private boolean isConvertible(FetchData fetch, Operator<?> operator, Set<Operator<?>> traversed) {
    if (operator instanceof ReduceSinkOperator || operator instanceof CommonJoinOperator
        || operator instanceof ScriptOperator) {
      return false;
    }
    if (operator instanceof FilterOperator) {
      fetch.setFiltered(true);
    }
    if (!traversed.add(operator)) {
      return true;
    }
    if (operator.getNumChild() == 0) {
      if (operator instanceof FileSinkOperator) {
        fetch.fileSink = operator;
        return true;
      }
      return false;
    }
    for (Operator<?> child : operator.getChildOperators()) {
      if (!traversed.containsAll(child.getParentOperators())) {
        continue;
      }
      if (!isConvertible(fetch, child, traversed)) {
        return false;
      }
    }
    return true;
  }

  enum Status {
    PASS, FAIL, UNAVAILABLE
  }

  private class FetchData {

    // source table scan
    private final TableScanOperator scanOp;
    private final ReadEntity parent;

    private final Table table;
    private final SplitSample splitSample;
    private final PrunedPartitionList partsList;
    private final Set<ReadEntity> inputs = new LinkedHashSet<ReadEntity>();
    private final boolean onlyPruningFilter;

    // this is always non-null when conversion is completed
    private Operator<?> fileSink;
    private boolean filtered;

    private FetchData(TableScanOperator scanOp, ReadEntity parent, Table table,
        SplitSample splitSample) {
      this.scanOp = scanOp;
      this.parent = parent;
      this.table = table;
      this.partsList = null;
      this.splitSample = splitSample;
      this.onlyPruningFilter = false;
    }

    private FetchData(TableScanOperator scanOp, ReadEntity parent, Table table,
        PrunedPartitionList partsList, SplitSample splitSample, boolean bypassFilter) {
      this.scanOp = scanOp;
      this.parent = parent;
      this.table = table;
      this.partsList = partsList;
      this.splitSample = splitSample;
      this.onlyPruningFilter = bypassFilter;
    }

    /*
     * all filters were executed during partition pruning
     */
    public final boolean hasOnlyPruningFilter() {
      return this.onlyPruningFilter;
    }

    public final boolean isPartitioned() {
      return this.table.isPartitioned();
    }

    /* there are filter operators in the pipeline */
    public final boolean isFiltered() {
      return this.filtered;
    }

    public final void setFiltered(boolean filtered) {
      this.filtered = filtered;
    }

    private FetchWork convertToWork() throws HiveException {
      inputs.clear();
      Utilities.addSchemaEvolutionToTableScanOperator(table, scanOp);
      TableDesc tableDesc = Utilities.getTableDesc(table);
      if (!table.isPartitioned()) {
        inputs.add(new ReadEntity(table, parent, !table.isView() && parent == null));
        FetchWork work = new FetchWork(table.getPath(), tableDesc);
        PlanUtils.configureInputJobPropertiesForStorageHandler(work.getTblDesc());
        work.setSplitSample(splitSample);
        return work;
      }
      List<Path> listP = new ArrayList<Path>();
      List<PartitionDesc> partP = new ArrayList<PartitionDesc>();

      for (Partition partition : partsList.getNotDeniedPartns()) {
        inputs.add(new ReadEntity(partition, parent, parent == null));
        listP.add(partition.getDataLocation());
        partP.add(Utilities.getPartitionDescFromTableDesc(tableDesc, partition, true));
      }
      Table sourceTable = partsList.getSourceTable();
      inputs.add(new ReadEntity(sourceTable, parent, parent == null));
      TableDesc table = Utilities.getTableDesc(sourceTable);
      FetchWork work = new FetchWork(listP, partP, table);
      if (!work.getPartDesc().isEmpty()) {
        PartitionDesc part0 = work.getPartDesc().get(0);
        PlanUtils.configureInputJobPropertiesForStorageHandler(part0.getTableDesc());
        work.setSplitSample(splitSample);
      }
      return work;
    }

    // This optimizer replaces "FS to temp + fetching from temp" with a single direct fetch,
    // which means the FS is not needed any more once conversion is completed.
    // Rows forwarded will be received by the ListSinkOperator, which replaces the FS.
    private ListSinkOperator completed(ParseContext pctx, FetchWork work) {
      for (ReadEntity input : inputs) {
        PlanUtils.addInput(pctx.getSemanticInputs(), input);
      }
      return replaceFSwithLS(fileSink, work.getSerializationNullFormat());
    }

    private boolean isDataLengthWithInThreshold(ParseContext pctx, final long threshold)
        throws Exception {
      if (splitSample != null && splitSample.getTotalLength() != null) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Threshold " + splitSample.getTotalLength() + " exceeded for pseudoMR mode");
        }
        return (threshold - splitSample.getTotalLength()) > 0;
      }
      Status status = checkThresholdWithMetastoreStats(table, partsList, threshold);
      if (status.equals(Status.PASS)) {
        return true;
      } else if (status.equals(Status.FAIL)) {
        return false;
      } else {
        LOG.info("Cannot fetch stats from metastore for table: {}. Falling back to filesystem scan..",
            table.getCompleteName());
        // metastore stats is unavailable, fallback to old way
        final JobConf jobConf = new JobConf(pctx.getConf());
        Utilities.setColumnNameList(jobConf, scanOp, true);
        Utilities.setColumnTypeList(jobConf, scanOp, true);
        HiveStorageHandler handler = table.getStorageHandler();
        if (handler instanceof InputEstimator) {
          InputEstimator estimator = (InputEstimator) handler;
          TableDesc tableDesc = Utilities.getTableDesc(table);
          PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
          Utilities.copyTableJobPropertiesToConf(tableDesc, jobConf);
          long len = estimator.estimate(jobConf, scanOp, threshold).getTotalLength();
          if (LOG.isDebugEnabled()) {
            LOG.debug("Threshold " + len + " exceeded for pseudoMR mode");
          }
          return (threshold - len) > 0;
        }
        if (table.isNonNative()) {
          return true; // nothing can be done
        }
        if (!table.isPartitioned()) {
          long len = getPathLength(jobConf, table.getPath(), table.getInputFormatClass(), threshold);
          if (LOG.isDebugEnabled()) {
            LOG.debug("Threshold " + len + " exceeded for pseudoMR mode");
          }
          return (threshold - len) > 0;
        }
        final AtomicLong total = new AtomicLong(0);
        //TODO: use common thread pool later?
        int threadCount = HiveConf.getIntVar(pctx.getConf(),
            HiveConf.ConfVars.HIVE_STATS_GATHER_NUM_THREADS);
        final ExecutorService pool = (threadCount > 0) ?
            Executors.newFixedThreadPool(threadCount,
                new ThreadFactoryBuilder().setDaemon(true)
                    .setNameFormat("SimpleFetchOptimizer-FileLength-%d").build()) : null;
        try {
          List<Future> futures = Lists.newLinkedList();
          for (final Partition partition : partsList.getNotDeniedPartns()) {
            final Path path = partition.getDataLocation();
            if (pool != null) {
              futures.add(pool.submit(new Callable<Long>() {
                @Override
                public Long call() throws Exception {
                  long len = getPathLength(jobConf, path, partition.getInputFormatClass(), threshold);
                  LOG.trace(path + ", length=" + len);
                  return total.addAndGet(len);
                }
              }));
            } else {
              total.addAndGet(
                  getPathLength(jobConf, path, partition.getInputFormatClass(), threshold));
            }
          }
          if (pool != null) {
            pool.shutdown();
            for (Future<Long> future : futures) {
              long totalLen = future.get();
              if ((threshold - totalLen) <= 0) {
                // early exit, as getting file lengths can be expensive in object stores.
                return false;
              }
            }
          }
          return (threshold - total.get()) >= 0;
        } finally {
          LOG.info("Data set size=" + total.get() + ", threshold=" + threshold);
          if (pool != null) {
            pool.shutdownNow();
          }
        }
      }
    }

    // This method gets the basic stats from the metastore for the table/partitions. It will make use
    // of the statistics from the AnnotateWithStatistics optimizer when available. If the execution
    // engine is tez or spark, the AnnotateWithStatistics optimization is applied only during physical
    // compilation because DPP changes the stats. In such a case, we will get the basic stats from the
    // metastore. When statistics are absent in the metastore, we fall back to scanning the filesystem
    // to get file lengths.
    private Status checkThresholdWithMetastoreStats(final Table table,
        final PrunedPartitionList partsList, final long threshold) {
      if (table != null && !table.isPartitioned()) {
        long dataSize = StatsUtils.getTotalSize(table);
        if (dataSize <= 0) {
          LOG.warn("Cannot determine basic stats for table: {} from metastore. Falling back.",
              table.getCompleteName());
          return Status.UNAVAILABLE;
        }
        return (threshold - dataSize) >= 0 ?
            Status.PASS : Status.FAIL;
      } else if (table != null && table.isPartitioned() && partsList != null) {
        List<Long> dataSizes = StatsUtils.getBasicStatForPartitions(table,
            partsList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE);
        long totalDataSize = StatsUtils.getSumIgnoreNegatives(dataSizes);
        if (totalDataSize <= 0) {
          LOG.warn("Cannot determine basic stats for partitioned table: {} from metastore. Falling back.",
              table.getCompleteName());
          return Status.UNAVAILABLE;
        }
        return (threshold - totalDataSize) >= 0 ? Status.PASS : Status.FAIL;
      }
      return Status.UNAVAILABLE;
    }

    private long getPathLength(JobConf conf, Path path, Class<? extends InputFormat> clazz,
        long threshold) throws IOException {
      if (ContentSummaryInputFormat.class.isAssignableFrom(clazz)) {
        InputFormat input = HiveInputFormat.getInputFormatFromCache(clazz, conf);
        return ((ContentSummaryInputFormat) input).getContentSummary(path, conf).getLength();
      } else {
        FileSystem fs = path.getFileSystem(conf);
        try {
          long length = 0;
          RemoteIterator<LocatedFileStatus> results = fs.listFiles(path, true);
          // No need to iterate more, when threshold is reached
          // (beneficial especially for object stores)
          while (length <= threshold && results.hasNext()) {
            length += results.next().getLen();
          }
          LOG.trace("length=" + length + ", threshold=" + threshold);
          return length;
        } catch (FileNotFoundException e) {
          return 0;
        }
      }
    }
  }

  public static ListSinkOperator replaceFSwithLS(Operator<?> fileSink, String nullFormat) {
    ListSinkDesc desc = new ListSinkDesc(nullFormat);
    ListSinkOperator sink =
        (ListSinkOperator) OperatorFactory.get(fileSink.getCompilationOpContext(), desc);
    sink.setParentOperators(new ArrayList<Operator<? extends OperatorDesc>>());
    Operator<? extends OperatorDesc> parent = fileSink.getParentOperators().get(0);
    sink.getParentOperators().add(parent);
    parent.replaceChild(fileSink, sink);
    fileSink.setParentOperators(null);
    return sink;
  }
}
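One detail worth lifting out of the file: getPathLength() sums file lengths under a path but stops iterating as soon as the running total passes the threshold, which avoids exhaustive listings on object stores. Below is a minimal, standalone sketch of the same pattern using only the Hadoop FileSystem API (the class BoundedPathLength and its methods are illustrative, not part of Hive):

import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class BoundedPathLength {

  // Recursively sums file lengths under path, but stops once the total exceeds
  // threshold; returns 0 if the path does not exist (mirroring getPathLength()).
  public static long lengthUpTo(Configuration conf, Path path, long threshold) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    try {
      long length = 0;
      RemoteIterator<LocatedFileStatus> files = fs.listFiles(path, true);
      while (length <= threshold && files.hasNext()) {
        length += files.next().getLen();
      }
      return length;
    } catch (FileNotFoundException e) {
      return 0;
    }
  }

  public static void main(String[] args) throws IOException {
    // Usage: BoundedPathLength <path>, e.g. a table or partition directory.
    Path path = new Path(args[0]);
    long threshold = 1024L * 1024L * 1024L; // illustrative 1 GB bound
    long length = lengthUpTo(new Configuration(), path, threshold);
    System.out.println(length > threshold ? "over threshold" : "length=" + length);
  }
}

The caller only needs to know whether the size crossed the threshold, not the exact total; that is also why the Hive code above can return early from the per-partition futures loop once any running total exceeds the limit.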