org.apache.hadoop.hive.ql.parse.TaskCompiler.java Source code

Introduction

Here is the source code for org.apache.hadoop.hive.ql.parse.TaskCompiler.java. TaskCompiler is the base class for the compilers that turn an operator pipeline into executable tasks.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.parse;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.exec.ColumnStatsTask;
import org.apache.hadoop.hive.ql.exec.FetchTask;
import org.apache.hadoop.hive.ql.exec.StatsTask;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.mr.ExecDriver;
import org.apache.hadoop.hive.ql.exec.spark.SparkTask;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.AnalyzeRewriteContext;
import org.apache.hadoop.hive.ql.plan.ColumnStatsDesc;
import org.apache.hadoop.hive.ql.plan.ColumnStatsWork;
import org.apache.hadoop.hive.ql.plan.CreateTableDesc;
import org.apache.hadoop.hive.ql.plan.DDLWork;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.LoadFileDesc;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
import org.apache.hadoop.hive.ql.plan.MoveWork;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;

import com.google.common.collect.Interner;
import com.google.common.collect.Interners;

/**
 * TaskCompiler is the base class for classes that compile
 * operator pipelines into tasks.
 */
public abstract class TaskCompiler {

    protected final Log LOG = LogFactory.getLog(TaskCompiler.class);

    protected Hive db;
    protected LogHelper console;
    protected HiveConf conf;

    public void init(HiveConf conf, LogHelper console, Hive db) {
        this.conf = conf;
        this.db = db;
        this.console = console;
    }

    @SuppressWarnings({ "nls", "unchecked" })
    public void compile(final ParseContext pCtx, final List<Task<? extends Serializable>> rootTasks,
            final HashSet<ReadEntity> inputs, final HashSet<WriteEntity> outputs) throws SemanticException {

        Context ctx = pCtx.getContext();
        GlobalLimitCtx globalLimitCtx = pCtx.getGlobalLimitCtx();
        List<Task<MoveWork>> mvTask = new ArrayList<Task<MoveWork>>();

        List<LoadTableDesc> loadTableWork = pCtx.getLoadTableWork();
        List<LoadFileDesc> loadFileWork = pCtx.getLoadFileWork();

        boolean isCStats = pCtx.getQueryProperties().isAnalyzeRewrite();
        int outerQueryLimit = pCtx.getQueryProperties().getOuterQueryLimit();

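        // A FetchTask has already been created for this query (e.g. by an earlier
        // optimization), so there is nothing more to compile here.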
        if (pCtx.getFetchTask() != null) {
            return;
        }

        optimizeOperatorPlan(pCtx, inputs, outputs);

        /*
         * In case of a select, use a fetch task instead of a move task.
         * If the select is from analyze table column rewrite, don't create a fetch task. Instead create
         * a column stats task later.
         */
        if (pCtx.getQueryProperties().isQuery() && !isCStats) {
            if ((!loadTableWork.isEmpty()) || (loadFileWork.size() != 1)) {
                throw new SemanticException(ErrorMsg.GENERIC_ERROR.getMsg());
            }

            LoadFileDesc loadFileDesc = loadFileWork.get(0);

            String cols = loadFileDesc.getColumns();
            String colTypes = loadFileDesc.getColumnTypes();

            TableDesc resultTab = pCtx.getFetchTableDesc();
            if (resultTab == null) {
                String resFileFormat = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYRESULTFILEFORMAT);
                resultTab = PlanUtils.getDefaultQueryOutputTableDesc(cols, colTypes, resFileFormat);
            }

            FetchWork fetch = new FetchWork(loadFileDesc.getSourcePath(), resultTab, outerQueryLimit);
            fetch.setSource(pCtx.getFetchSource());
            fetch.setSink(pCtx.getFetchSink());

            pCtx.setFetchTask((FetchTask) TaskFactory.get(fetch, conf));

            // For the FetchTask, the limit optimization requires that we fetch all the rows
            // into memory and count how many rows we get. That is not practical if the
            // limit is too big.
            int fetchLimit = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVELIMITOPTMAXFETCH);
            if (globalLimitCtx.isEnable() && globalLimitCtx.getGlobalLimit() > fetchLimit) {
                LOG.info("For FetchTask, LIMIT " + globalLimitCtx.getGlobalLimit() + " > " + fetchLimit
                        + ". Doesn't qualify limit optimiztion.");
                globalLimitCtx.disableOpt();

            }
            if (outerQueryLimit == 0) {
                // Believe it or not, some tools do generate queries with LIMIT 0 and then expect
                // the query to run quickly. Let's meet their requirement.
                LOG.info("Limit 0. No query execution needed.");
                return;
            }
        } else if (!isCStats) {
            for (LoadTableDesc ltd : loadTableWork) {
                Task<MoveWork> tsk = TaskFactory.get(new MoveWork(null, null, ltd, null, false), conf);
                mvTask.add(tsk);
                // Check whether this load makes any indexes stale and auto-update them if configured to
                if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEINDEXAUTOUPDATE)) {
                    IndexUpdater indexUpdater = new IndexUpdater(loadTableWork, inputs, conf);
                    try {
                        List<Task<? extends Serializable>> indexUpdateTasks = indexUpdater.generateUpdateTasks();
                        for (Task<? extends Serializable> updateTask : indexUpdateTasks) {
                            tsk.addDependentTask(updateTask);
                        }
                    } catch (HiveException e) {
                        console.printInfo("WARNING: could not auto-update stale indexes, which are not in sync");
                    }
                }
            }

            boolean oneLoadFile = true;
            for (LoadFileDesc lfd : loadFileWork) {
                if (pCtx.getQueryProperties().isCTAS()) {
                    assert (oneLoadFile); // CTAS should not have more than one load file
                    // make the MoveTask's destination directory the table's destination.
                    Path location;
                    String loc = pCtx.getCreateTable().getLocation();
                    if (loc == null) {
                        // get the table's default location
                        Path targetPath;
                        try {
                            String[] names = Utilities.getDbTableName(pCtx.getCreateTable().getTableName());
                            if (!db.databaseExists(names[0])) {
                                throw new SemanticException("ERROR: The database " + names[0] + " does not exist.");
                            }
                            Warehouse wh = new Warehouse(conf);
                            targetPath = wh.getTablePath(db.getDatabase(names[0]), names[1]);
                        } catch (HiveException e) {
                            throw new SemanticException(e);
                        } catch (MetaException e) {
                            throw new SemanticException(e);
                        }

                        location = targetPath;
                    } else {
                        location = new Path(loc);
                    }
                    lfd.setTargetDir(location);

                    oneLoadFile = false;
                }
                mvTask.add(TaskFactory.get(new MoveWork(null, null, null, lfd, false), conf));
            }
        }

        generateTaskTree(rootTasks, pCtx, mvTask, inputs, outputs);

        /*
         * If the query was the result of analyze table column compute statistics rewrite, create
         * a column stats task instead of a fetch task to persist stats to the metastore.
         */
        if (isCStats) {
            genColumnStatsTask(pCtx.getAnalyzeRewrite(), loadTableWork, loadFileWork, rootTasks, outerQueryLimit);
        }

        // For each task, set the key descriptor for the reducer
        for (Task<? extends Serializable> rootTask : rootTasks) {
            GenMapRedUtils.setKeyAndValueDescForTaskTree(rootTask);
        }

        // If a task contains an operator which requires BucketizedHiveInputFormat,
        // set that input format on the task
        for (Task<? extends Serializable> rootTask : rootTasks) {
            setInputFormat(rootTask);
        }

        optimizeTaskPlan(rootTasks, pCtx, ctx);

        decideExecMode(rootTasks, ctx, globalLimitCtx);

        if (pCtx.getQueryProperties().isCTAS()) {
            // generate a DDL task and make it a dependent task of the leaf
            CreateTableDesc crtTblDesc = pCtx.getCreateTable();

            crtTblDesc.validate(conf);

            // clear the mapredWork output file from outputs for CTAS
            // DDLWork at the tail of the chain will have the output
            Iterator<WriteEntity> outIter = outputs.iterator();
            while (outIter.hasNext()) {
                switch (outIter.next().getType()) {
                case DFS_DIR:
                case LOCAL_DIR:
                    outIter.remove();
                    break;
                default:
                    break;
                }
            }
            Task<? extends Serializable> crtTblTask = TaskFactory.get(new DDLWork(inputs, outputs, crtTblDesc),
                    conf);

            // find all leaf tasks and make the DDLTask a dependent task of all of
            // them
            HashSet<Task<? extends Serializable>> leaves = new LinkedHashSet<Task<? extends Serializable>>();
            getLeafTasks(rootTasks, leaves);
            assert (leaves.size() > 0);
            for (Task<? extends Serializable> task : leaves) {
                if (task instanceof StatsTask) {
                    // StatsTask requires the table to already exist
                    for (Task<? extends Serializable> parentOfStatsTask : task.getParentTasks()) {
                        parentOfStatsTask.addDependentTask(crtTblTask);
                    }
                    for (Task<? extends Serializable> parentOfCrtTblTask : crtTblTask.getParentTasks()) {
                        parentOfCrtTblTask.removeDependentTask(task);
                    }
                    crtTblTask.addDependentTask(task);
                } else {
                    task.addDependentTask(crtTblTask);
                }
            }
        }

        if (globalLimitCtx.isEnable() && pCtx.getFetchTask() != null) {
            LOG.info("set least row check for FetchTask: " + globalLimitCtx.getGlobalLimit());
            pCtx.getFetchTask().getWork().setLeastNumRows(globalLimitCtx.getGlobalLimit());
        }

        if (globalLimitCtx.isEnable() && globalLimitCtx.getLastReduceLimitDesc() != null) {
            LOG.info("set least row check for LimitDesc: " + globalLimitCtx.getGlobalLimit());
            globalLimitCtx.getLastReduceLimitDesc().setLeastRows(globalLimitCtx.getGlobalLimit());
            List<ExecDriver> mrTasks = Utilities.getMRTasks(rootTasks);
            for (ExecDriver tsk : mrTasks) {
                tsk.setRetryCmdWhenFail(true);
            }
            List<SparkTask> sparkTasks = Utilities.getSparkTasks(rootTasks);
            for (SparkTask sparkTask : sparkTasks) {
                sparkTask.setRetryCmdWhenFail(true);
            }
        }

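        // Intern the TableDesc objects across all tasks so that duplicate descriptors
        // share a single instance, reducing the memory footprint of the plan.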
        Interner<TableDesc> interner = Interners.newStrongInterner();
        for (Task<? extends Serializable> rootTask : rootTasks) {
            GenMapRedUtils.internTableDesc(rootTask, interner);
        }
    }

    /**
     * A helper function to generate a column stats task on top of the map-red task. The column
     * stats task fetches from the output of the map-red task, constructs the column stats object
     * and persists it to the metastore.
     *
     * This method generates a plan with a column stats task on top of the map-red task and sets
     * up the appropriate metadata to be used during execution.
     */
    @SuppressWarnings("unchecked")
    protected void genColumnStatsTask(AnalyzeRewriteContext analyzeRewrite, List<LoadTableDesc> loadTableWork,
            List<LoadFileDesc> loadFileWork, List<Task<? extends Serializable>> rootTasks, int outerQueryLimit) {
        ColumnStatsTask cStatsTask = null;
        ColumnStatsWork cStatsWork = null;
        FetchWork fetch = null;
        String tableName = analyzeRewrite.getTableName();
        List<String> colName = analyzeRewrite.getColName();
        List<String> colType = analyzeRewrite.getColType();
        boolean isTblLevel = analyzeRewrite.isTblLvl();

        String cols = loadFileWork.get(0).getColumns();
        String colTypes = loadFileWork.get(0).getColumnTypes();

        String resFileFormat = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYRESULTFILEFORMAT);
        TableDesc resultTab = PlanUtils.getDefaultQueryOutputTableDesc(cols, colTypes, resFileFormat);

        fetch = new FetchWork(loadFileWork.get(0).getSourcePath(), resultTab, outerQueryLimit);

        ColumnStatsDesc cStatsDesc = new ColumnStatsDesc(tableName, colName, colType, isTblLevel);
        cStatsWork = new ColumnStatsWork(fetch, cStatsDesc);
        cStatsTask = (ColumnStatsTask) TaskFactory.get(cStatsWork, conf);
        // This is a column stats task. According to the semantics, there should be
        // only one MR task among the root tasks.
        rootTasks.get(0).addDependentTask(cStatsTask);
    }

    /**
     * Find all leaf tasks of the list of root tasks.
     */
    protected void getLeafTasks(List<Task<? extends Serializable>> rootTasks,
            HashSet<Task<? extends Serializable>> leaves) {

        for (Task<? extends Serializable> root : rootTasks) {
            getLeafTasks(root, leaves);
        }
    }

    private void getLeafTasks(Task<? extends Serializable> task, HashSet<Task<? extends Serializable>> leaves) {
        if (task.getDependentTasks() == null) {
            if (!leaves.contains(task)) {
                leaves.add(task);
            }
        } else {
            getLeafTasks(task.getDependentTasks(), leaves);
        }
    }

    /*
     * Called to transform tasks into local tasks where possible/desirable
     */
    protected abstract void decideExecMode(List<Task<? extends Serializable>> rootTasks, Context ctx,
            GlobalLimitCtx globalLimitCtx) throws SemanticException;

    /*
     * Called at the beginning of the compile phase to have another chance to optimize the operator plan
     */
    protected void optimizeOperatorPlan(ParseContext pCtxSet, Set<ReadEntity> inputs, Set<WriteEntity> outputs)
            throws SemanticException {
    }

    /*
     * Called after the tasks have been generated to run another round of optimization
     */
    protected abstract void optimizeTaskPlan(List<Task<? extends Serializable>> rootTasks, ParseContext pCtx,
            Context ctx) throws SemanticException;

    /*
     * Called to set the appropriate input format for tasks
     */
    protected abstract void setInputFormat(Task<? extends Serializable> rootTask);

    /*
     * Called to generate the task tree from the parse context/operator tree
     */
    protected abstract void generateTaskTree(List<Task<? extends Serializable>> rootTasks, ParseContext pCtx,
            List<Task<MoveWork>> mvTask, Set<ReadEntity> inputs, Set<WriteEntity> outputs) throws SemanticException;

    /**
     * Create a clone of the parse context
     */
    public ParseContext getParseContext(ParseContext pCtx, List<Task<? extends Serializable>> rootTasks) {
        ParseContext clone = new ParseContext(conf, pCtx.getOpToPartPruner(), pCtx.getOpToPartList(),
                pCtx.getTopOps(), pCtx.getJoinOps(), pCtx.getSmbMapJoinOps(), pCtx.getLoadTableWork(),
                pCtx.getLoadFileWork(), pCtx.getContext(), pCtx.getIdToTableNameMap(), pCtx.getDestTableId(),
                pCtx.getUCtx(), pCtx.getListMapJoinOpsNoReducer(), pCtx.getPrunedPartitions(),
                pCtx.getOpToSamplePruner(), pCtx.getGlobalLimitCtx(), pCtx.getNameToSplitSample(),
                pCtx.getSemanticInputs(), rootTasks, pCtx.getOpToPartToSkewedPruner(), pCtx.getViewAliasToInput(),
                pCtx.getReduceSinkOperatorsAddedByEnforceBucketingSorting(), pCtx.getAnalyzeRewrite(),
                pCtx.getCreateTable(), pCtx.getQueryProperties());
        clone.setFetchTask(pCtx.getFetchTask());
        clone.setLineageInfo(pCtx.getLineageInfo());
        clone.setMapJoinOps(pCtx.getMapJoinOps());
        return clone;
    }
}
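
The four abstract hooks above (generateTaskTree, setInputFormat, optimizeTaskPlan and decideExecMode) are what a concrete compiler must supply; compile() drives the overall flow and calls them in order. The sketch below is not part of the Hive sources: it is a minimal, hypothetical subclass (NoopTaskCompiler) that only illustrates how the template fits together, assuming the Hive libraries shown in the listing are on the classpath.

package org.apache.hadoop.hive.ql.parse;

import java.io.Serializable;
import java.util.List;
import java.util.Set;

import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.plan.MoveWork;

/**
 * Hypothetical, do-nothing TaskCompiler used only to illustrate the template:
 * compile() in the base class drives the flow, the subclass fills in the
 * engine-specific steps.
 */
public class NoopTaskCompiler extends TaskCompiler {

    @Override
    protected void generateTaskTree(List<Task<? extends Serializable>> rootTasks, ParseContext pCtx,
            List<Task<MoveWork>> mvTask, Set<ReadEntity> inputs, Set<WriteEntity> outputs)
            throws SemanticException {
        // A real compiler walks the operator tree here and appends
        // executable tasks to rootTasks.
    }

    @Override
    protected void setInputFormat(Task<? extends Serializable> rootTask) {
        // A real compiler inspects the task's operators and switches the input
        // format (e.g. to a bucketized input format) when required.
    }

    @Override
    protected void optimizeTaskPlan(List<Task<? extends Serializable>> rootTasks, ParseContext pCtx,
            Context ctx) throws SemanticException {
        // Physical optimizations over the generated task tree would run here.
    }

    @Override
    protected void decideExecMode(List<Task<? extends Serializable>> rootTasks, Context ctx,
            GlobalLimitCtx globalLimitCtx) throws SemanticException {
        // A real compiler may decide here to run small jobs locally.
    }
}

In Hive itself this role is played by engine-specific subclasses (for example MapReduce- or Spark-oriented compilers, as hinted by the ExecDriver and SparkTask imports in the listing); the class name and empty method bodies in the sketch are purely illustrative.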