Apache Hive: org.apache.hadoop.hive.ql.parse.TaskCompiler (TaskCompiler.java)
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.parse;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.exec.ColumnStatsTask;
import org.apache.hadoop.hive.ql.exec.FetchTask;
import org.apache.hadoop.hive.ql.exec.StatsTask;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.mr.ExecDriver;
import org.apache.hadoop.hive.ql.exec.spark.SparkTask;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.AnalyzeRewriteContext;
import org.apache.hadoop.hive.ql.plan.ColumnStatsDesc;
import org.apache.hadoop.hive.ql.plan.ColumnStatsWork;
import org.apache.hadoop.hive.ql.plan.CreateTableDesc;
import org.apache.hadoop.hive.ql.plan.DDLWork;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.LoadFileDesc;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
import org.apache.hadoop.hive.ql.plan.MoveWork;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;

import com.google.common.collect.Interner;
import com.google.common.collect.Interners;

/**
 * TaskCompiler is the base class for classes that compile
 * operator pipelines into tasks.
 */
public abstract class TaskCompiler {

  protected final Log LOG = LogFactory.getLog(TaskCompiler.class);

  protected Hive db;
  protected LogHelper console;
  protected HiveConf conf;

  public void init(HiveConf conf, LogHelper console, Hive db) {
    this.conf = conf;
    this.db = db;
    this.console = console;
  }

  @SuppressWarnings({ "nls", "unchecked" })
  public void compile(final ParseContext pCtx, final List<Task<? extends Serializable>> rootTasks,
      final HashSet<ReadEntity> inputs, final HashSet<WriteEntity> outputs) throws SemanticException {

    Context ctx = pCtx.getContext();
    GlobalLimitCtx globalLimitCtx = pCtx.getGlobalLimitCtx();
    List<Task<MoveWork>> mvTask = new ArrayList<Task<MoveWork>>();

    List<LoadTableDesc> loadTableWork = pCtx.getLoadTableWork();
    List<LoadFileDesc> loadFileWork = pCtx.getLoadFileWork();

    boolean isCStats = pCtx.getQueryProperties().isAnalyzeRewrite();
    int outerQueryLimit = pCtx.getQueryProperties().getOuterQueryLimit();

    if (pCtx.getFetchTask() != null) {
      return;
    }

    optimizeOperatorPlan(pCtx, inputs, outputs);

    /*
     * In case of a select, use a fetch task instead of a move task.
     * If the select is from analyze table column rewrite, don't create a fetch task. Instead create
     * a column stats task later.
     */
    if (pCtx.getQueryProperties().isQuery() && !isCStats) {
      if ((!loadTableWork.isEmpty()) || (loadFileWork.size() != 1)) {
        throw new SemanticException(ErrorMsg.GENERIC_ERROR.getMsg());
      }

      LoadFileDesc loadFileDesc = loadFileWork.get(0);

      String cols = loadFileDesc.getColumns();
      String colTypes = loadFileDesc.getColumnTypes();

      TableDesc resultTab = pCtx.getFetchTableDesc();
      if (resultTab == null) {
        String resFileFormat = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYRESULTFILEFORMAT);
        resultTab = PlanUtils.getDefaultQueryOutputTableDesc(cols, colTypes, resFileFormat);
      }

      FetchWork fetch = new FetchWork(loadFileDesc.getSourcePath(), resultTab, outerQueryLimit);
      fetch.setSource(pCtx.getFetchSource());
      fetch.setSink(pCtx.getFetchSink());

      pCtx.setFetchTask((FetchTask) TaskFactory.get(fetch, conf));

      // For the FetchTask, the limit optimization requires we fetch all the rows
      // in memory and count how many rows we get. It's not practical if the
      // limit factor is too big.
      int fetchLimit = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVELIMITOPTMAXFETCH);
      if (globalLimitCtx.isEnable() && globalLimitCtx.getGlobalLimit() > fetchLimit) {
        LOG.info("For FetchTask, LIMIT " + globalLimitCtx.getGlobalLimit() + " > " + fetchLimit
            + ". Doesn't qualify limit optimization.");
        globalLimitCtx.disableOpt();
      }
      if (outerQueryLimit == 0) {
        // Believe it or not, some tools do generate queries with LIMIT 0 and then expect
        // the query to run quickly. Let's meet their requirement.
        LOG.info("Limit 0. No query execution needed.");
        return;
      }
    } else if (!isCStats) {
      for (LoadTableDesc ltd : loadTableWork) {
        Task<MoveWork> tsk = TaskFactory.get(new MoveWork(null, null, ltd, null, false), conf);
        mvTask.add(tsk);
        // Check to see if we are making any indexes stale and auto-update them if we want
        if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEINDEXAUTOUPDATE)) {
          IndexUpdater indexUpdater = new IndexUpdater(loadTableWork, inputs, conf);
          try {
            List<Task<? extends Serializable>> indexUpdateTasks = indexUpdater.generateUpdateTasks();
            for (Task<? extends Serializable> updateTask : indexUpdateTasks) {
              tsk.addDependentTask(updateTask);
            }
          } catch (HiveException e) {
            console.printInfo("WARNING: could not auto-update stale indexes, which are not in sync");
          }
        }
      }

      boolean oneLoadFile = true;
      for (LoadFileDesc lfd : loadFileWork) {
        if (pCtx.getQueryProperties().isCTAS()) {
          assert (oneLoadFile); // should not have more than 1 load file for CTAS
          // make the movetask's destination directory the table's destination
          Path location;
          String loc = pCtx.getCreateTable().getLocation();
          if (loc == null) {
            // get the table's default location
            Path targetPath;
            try {
              String[] names = Utilities.getDbTableName(pCtx.getCreateTable().getTableName());
              if (!db.databaseExists(names[0])) {
                throw new SemanticException("ERROR: The database " + names[0] + " does not exist.");
              }
              Warehouse wh = new Warehouse(conf);
              targetPath = wh.getTablePath(db.getDatabase(names[0]), names[1]);
            } catch (HiveException e) {
              throw new SemanticException(e);
            } catch (MetaException e) {
              throw new SemanticException(e);
            }
            location = targetPath;
          } else {
            location = new Path(loc);
          }
          lfd.setTargetDir(location);

          oneLoadFile = false;
        }
        mvTask.add(TaskFactory.get(new MoveWork(null, null, null, lfd, false), conf));
      }
    }

    generateTaskTree(rootTasks, pCtx, mvTask, inputs, outputs);

    /*
     * If the query was the result of analyze table column compute statistics rewrite, create
     * a column stats task instead of a fetch task to persist stats to the metastore.
     */
    if (isCStats) {
      genColumnStatsTask(pCtx.getAnalyzeRewrite(), loadTableWork, loadFileWork, rootTasks,
          outerQueryLimit);
    }

    // For each task, set the key descriptor for the reducer
    for (Task<? extends Serializable> rootTask : rootTasks) {
      GenMapRedUtils.setKeyAndValueDescForTaskTree(rootTask);
    }

    // If a task contains an operator which instructs bucketizedhiveinputformat
    // to be used, please do so
    for (Task<? extends Serializable> rootTask : rootTasks) {
      setInputFormat(rootTask);
    }

    optimizeTaskPlan(rootTasks, pCtx, ctx);

    decideExecMode(rootTasks, ctx, globalLimitCtx);

    if (pCtx.getQueryProperties().isCTAS()) {
      // generate a DDL task and make it a dependent task of the leaf
      CreateTableDesc crtTblDesc = pCtx.getCreateTable();

      crtTblDesc.validate(conf);

      // clear the mapredWork output file from outputs for CTAS
      // DDLWork at the tail of the chain will have the output
      Iterator<WriteEntity> outIter = outputs.iterator();
      while (outIter.hasNext()) {
        switch (outIter.next().getType()) {
        case DFS_DIR:
        case LOCAL_DIR:
          outIter.remove();
          break;
        default:
          break;
        }
      }

      Task<? extends Serializable> crtTblTask = TaskFactory.get(
          new DDLWork(inputs, outputs, crtTblDesc), conf);

      // find all leaf tasks and make the DDLTask a dependent task of all of them
      HashSet<Task<? extends Serializable>> leaves =
          new LinkedHashSet<Task<? extends Serializable>>();
      getLeafTasks(rootTasks, leaves);
      assert (leaves.size() > 0);
      for (Task<? extends Serializable> task : leaves) {
        if (task instanceof StatsTask) {
          // StatsTask requires the table to already exist
          for (Task<? extends Serializable> parentOfStatsTask : task.getParentTasks()) {
            parentOfStatsTask.addDependentTask(crtTblTask);
          }
          for (Task<? extends Serializable> parentOfCrtTblTask : crtTblTask.getParentTasks()) {
            parentOfCrtTblTask.removeDependentTask(task);
          }
          crtTblTask.addDependentTask(task);
        } else {
          task.addDependentTask(crtTblTask);
        }
      }
    }

    if (globalLimitCtx.isEnable() && pCtx.getFetchTask() != null) {
      LOG.info("set least row check for FetchTask: " + globalLimitCtx.getGlobalLimit());
      pCtx.getFetchTask().getWork().setLeastNumRows(globalLimitCtx.getGlobalLimit());
    }

    if (globalLimitCtx.isEnable() && globalLimitCtx.getLastReduceLimitDesc() != null) {
      LOG.info("set least row check for LimitDesc: " + globalLimitCtx.getGlobalLimit());
      globalLimitCtx.getLastReduceLimitDesc().setLeastRows(globalLimitCtx.getGlobalLimit());
      List<ExecDriver> mrTasks = Utilities.getMRTasks(rootTasks);
      for (ExecDriver tsk : mrTasks) {
        tsk.setRetryCmdWhenFail(true);
      }
      List<SparkTask> sparkTasks = Utilities.getSparkTasks(rootTasks);
      for (SparkTask sparkTask : sparkTasks) {
        sparkTask.setRetryCmdWhenFail(true);
      }
    }

    Interner<TableDesc> interner = Interners.newStrongInterner();
    for (Task<? extends Serializable> rootTask : rootTasks) {
      GenMapRedUtils.internTableDesc(rootTask, interner);
    }
  }

  /**
   * A helper function to generate a column stats task on top of a map-red task. The column stats
   * task fetches from the output of the map-red task, constructs the column stats object and
   * persists it to the metastore.
   *
   * This method generates a plan with a column stats task on top of the map-red task and sets up
   * the appropriate metadata to be used during execution.
   */
  @SuppressWarnings("unchecked")
  protected void genColumnStatsTask(AnalyzeRewriteContext analyzeRewrite,
      List<LoadTableDesc> loadTableWork, List<LoadFileDesc> loadFileWork,
      List<Task<? extends Serializable>> rootTasks, int outerQueryLimit) {
    ColumnStatsTask cStatsTask = null;
    ColumnStatsWork cStatsWork = null;
    FetchWork fetch = null;
    String tableName = analyzeRewrite.getTableName();
    List<String> colName = analyzeRewrite.getColName();
    List<String> colType = analyzeRewrite.getColType();
    boolean isTblLevel = analyzeRewrite.isTblLvl();

    String cols = loadFileWork.get(0).getColumns();
    String colTypes = loadFileWork.get(0).getColumnTypes();

    String resFileFormat = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYRESULTFILEFORMAT);
    TableDesc resultTab = PlanUtils.getDefaultQueryOutputTableDesc(cols, colTypes, resFileFormat);

    fetch = new FetchWork(loadFileWork.get(0).getSourcePath(), resultTab, outerQueryLimit);

    ColumnStatsDesc cStatsDesc = new ColumnStatsDesc(tableName, colName, colType, isTblLevel);
    cStatsWork = new ColumnStatsWork(fetch, cStatsDesc);
    cStatsTask = (ColumnStatsTask) TaskFactory.get(cStatsWork, conf);
    // This is a column stats task. According to the semantics, there should be
    // only one MR task in rootTasks.
    rootTasks.get(0).addDependentTask(cStatsTask);
  }

  /**
   * Find all leaf tasks of the list of root tasks.
   */
  protected void getLeafTasks(List<Task<? extends Serializable>> rootTasks,
      HashSet<Task<? extends Serializable>> leaves) {
    for (Task<? extends Serializable> root : rootTasks) {
      getLeafTasks(root, leaves);
    }
  }

  private void getLeafTasks(Task<? extends Serializable> task,
      HashSet<Task<? extends Serializable>> leaves) {
    if (task.getDependentTasks() == null) {
      if (!leaves.contains(task)) {
        leaves.add(task);
      }
    } else {
      getLeafTasks(task.getDependentTasks(), leaves);
    }
  }

  /*
   * Called to transform tasks into local tasks where possible/desirable
   */
  protected abstract void decideExecMode(List<Task<? extends Serializable>> rootTasks, Context ctx,
      GlobalLimitCtx globalLimitCtx) throws SemanticException;

  /*
   * Called at the beginning of the compile phase to have another chance to optimize the operator plan
   */
  protected void optimizeOperatorPlan(ParseContext pCtxSet, Set<ReadEntity> inputs,
      Set<WriteEntity> outputs) throws SemanticException {
  }

  /*
   * Called after the tasks have been generated to run another round of optimization
   */
  protected abstract void optimizeTaskPlan(List<Task<? extends Serializable>> rootTasks,
      ParseContext pCtx, Context ctx) throws SemanticException;

  /*
   * Called to set the appropriate input format for tasks
   */
  protected abstract void setInputFormat(Task<? extends Serializable> rootTask);

  /*
   * Called to generate the task tree from the parse context/operator tree
   */
  protected abstract void generateTaskTree(List<Task<? extends Serializable>> rootTasks,
      ParseContext pCtx, List<Task<MoveWork>> mvTask, Set<ReadEntity> inputs,
      Set<WriteEntity> outputs) throws SemanticException;

  /**
   * Create a clone of the parse context
   */
  public ParseContext getParseContext(ParseContext pCtx, List<Task<? extends Serializable>> rootTasks) {
    ParseContext clone = new ParseContext(conf,
        pCtx.getOpToPartPruner(), pCtx.getOpToPartList(), pCtx.getTopOps(),
        pCtx.getJoinOps(), pCtx.getSmbMapJoinOps(),
        pCtx.getLoadTableWork(), pCtx.getLoadFileWork(), pCtx.getContext(),
        pCtx.getIdToTableNameMap(), pCtx.getDestTableId(), pCtx.getUCtx(),
        pCtx.getListMapJoinOpsNoReducer(),
        pCtx.getPrunedPartitions(), pCtx.getOpToSamplePruner(), pCtx.getGlobalLimitCtx(),
        pCtx.getNameToSplitSample(), pCtx.getSemanticInputs(), rootTasks,
        pCtx.getOpToPartToSkewedPruner(), pCtx.getViewAliasToInput(),
        pCtx.getReduceSinkOperatorsAddedByEnforceBucketingSorting(),
        pCtx.getAnalyzeRewrite(), pCtx.getCreateTable(), pCtx.getQueryProperties());
    clone.setFetchTask(pCtx.getFetchTask());
    clone.setLineageInfo(pCtx.getLineageInfo());
    clone.setMapJoinOps(pCtx.getMapJoinOps());

    return clone;
  }
}
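
For orientation, here is a minimal sketch of a TaskCompiler subclass. It is hypothetical (the class name NoopTaskCompiler and its empty method bodies are illustrative assumptions, not part of Hive) and only shows which hooks a concrete compiler must supply; compile() above drives them in roughly this order: optimizeOperatorPlan, generateTaskTree, setInputFormat (per root task), optimizeTaskPlan, decideExecMode.

package org.apache.hadoop.hive.ql.parse;

import java.io.Serializable;
import java.util.List;
import java.util.Set;

import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.plan.MoveWork;

/**
 * Hypothetical, do-nothing TaskCompiler subclass used only to illustrate the
 * abstract hooks; a real compiler does substantial work in each of these methods.
 */
public class NoopTaskCompiler extends TaskCompiler {

  @Override
  protected void generateTaskTree(List<Task<? extends Serializable>> rootTasks, ParseContext pCtx,
      List<Task<MoveWork>> mvTask, Set<ReadEntity> inputs, Set<WriteEntity> outputs)
      throws SemanticException {
    // Translate the operator tree in pCtx into execution tasks and add them to rootTasks.
  }

  @Override
  protected void setInputFormat(Task<? extends Serializable> rootTask) {
    // Pick the input format for a root task (e.g. a bucketized input format when required).
  }

  @Override
  protected void optimizeTaskPlan(List<Task<? extends Serializable>> rootTasks, ParseContext pCtx,
      Context ctx) throws SemanticException {
    // Run task-level optimizations after the task tree has been generated.
  }

  @Override
  protected void decideExecMode(List<Task<? extends Serializable>> rootTasks, Context ctx,
      GlobalLimitCtx globalLimitCtx) throws SemanticException {
    // Decide whether tasks can be transformed into local tasks.
  }
}

A concrete compiler would typically also override optimizeOperatorPlan, which has a default empty implementation in TaskCompiler above.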