Java tutorial: how Hive's SparkCompiler translates an operator plan into SparkTasks
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.parse.spark;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.exec.ConditionalTask;
import org.apache.hadoop.hive.ql.exec.DummyStoreOperator;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.FilterOperator;
import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.UnionOperator;
import org.apache.hadoop.hive.ql.exec.spark.SparkTask;
import org.apache.hadoop.hive.ql.exec.spark.SparkUtilities;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.lib.CompositeProcessor;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.ForwardWalker;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.lib.TypeRule;
import org.apache.hadoop.hive.ql.log.PerfLogger;
import org.apache.hadoop.hive.ql.optimizer.ConstantPropagate;
import org.apache.hadoop.hive.ql.optimizer.DynamicPartitionPruningOptimization;
import org.apache.hadoop.hive.ql.optimizer.SparkRemoveDynamicPruningBySize;
import org.apache.hadoop.hive.ql.optimizer.metainfo.annotation.AnnotateWithOpTraits;
import org.apache.hadoop.hive.ql.optimizer.physical.MetadataOnlyOptimizer;
import org.apache.hadoop.hive.ql.optimizer.physical.NullScanOptimizer;
import org.apache.hadoop.hive.ql.optimizer.physical.PhysicalContext;
import org.apache.hadoop.hive.ql.optimizer.physical.SparkCrossProductCheck;
import org.apache.hadoop.hive.ql.optimizer.physical.SparkMapJoinResolver;
import org.apache.hadoop.hive.ql.optimizer.physical.StageIDsRearranger;
import org.apache.hadoop.hive.ql.optimizer.physical.Vectorizer;
import org.apache.hadoop.hive.ql.optimizer.spark.CombineEquivalentWorkResolver;
import org.apache.hadoop.hive.ql.optimizer.spark.SetSparkReducerParallelism;
import org.apache.hadoop.hive.ql.optimizer.spark.SparkJoinHintOptimizer;
import org.apache.hadoop.hive.ql.optimizer.spark.SparkJoinOptimizer;
import org.apache.hadoop.hive.ql.optimizer.spark.SparkReduceSinkMapJoinProc;
import org.apache.hadoop.hive.ql.optimizer.spark.SparkSkewJoinResolver;
import org.apache.hadoop.hive.ql.optimizer.spark.SplitSparkWorkResolver;
import org.apache.hadoop.hive.ql.optimizer.stats.annotation.AnnotateWithStatistics;
import org.apache.hadoop.hive.ql.parse.GlobalLimitCtx;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.parse.TaskCompiler;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.MoveWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.SparkWork;

/**
 * SparkCompiler translates the operator plan into SparkTasks.
 *
 * Cloned from TezCompiler.
 */
public class SparkCompiler extends TaskCompiler {
  private static final String CLASS_NAME = SparkCompiler.class.getName();
  private static final PerfLogger PERF_LOGGER = PerfLogger.getPerfLogger();
  private static final Log LOGGER = LogFactory.getLog(SparkCompiler.class);

  public SparkCompiler() {
  }

  @Override
  protected void optimizeOperatorPlan(ParseContext pCtx, Set<ReadEntity> inputs,
      Set<WriteEntity> outputs) throws SemanticException {
    PERF_LOGGER.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_OPTIMIZE_OPERATOR_TREE);

    OptimizeSparkProcContext procCtx = new OptimizeSparkProcContext(conf, pCtx, inputs, outputs);

    // Run Spark Dynamic Partition Pruning
    runDynamicPartitionPruning(procCtx);

    // Annotate OP tree with statistics
    runStatsAnnotation(procCtx);

    // Run join-related optimizations
    runJoinOptimizations(procCtx);

    PERF_LOGGER.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_OPTIMIZE_OPERATOR_TREE);
  }

  private void runStatsAnnotation(OptimizeSparkProcContext procCtx) throws SemanticException {
    new AnnotateWithStatistics().transform(procCtx.getParseContext());
    new AnnotateWithOpTraits().transform(procCtx.getParseContext());
  }

  private void runDynamicPartitionPruning(OptimizeSparkProcContext procCtx)
      throws SemanticException {
    if (!conf.getBoolVar(HiveConf.ConfVars.SPARK_DYNAMIC_PARTITION_PRUNING)) {
      return;
    }

    ParseContext parseContext = procCtx.getParseContext();
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    opRules.put(new RuleRegExp("Dynamic Partition Pruning",
        FilterOperator.getOperatorName() + "%"),
        new DynamicPartitionPruningOptimization());

    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along
    Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx);
    GraphWalker ogw = new ForwardWalker(disp);

    List<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(parseContext.getTopOps().values());
    ogw.startWalking(topNodes, null);

    // need a new run of the constant folding because we might have created lots
    // of "and true and true" conditions.
    if (procCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVEOPTCONSTANTPROPAGATION)) {
      new ConstantPropagate().transform(parseContext);
    }
  }

  private void runJoinOptimizations(OptimizeSparkProcContext procCtx) throws SemanticException {
    ParseContext pCtx = procCtx.getParseContext();
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();

    opRules.put(new RuleRegExp("Set parallelism - ReduceSink",
        ReduceSinkOperator.getOperatorName() + "%"),
        new SetSparkReducerParallelism());

    opRules.put(new TypeRule(JoinOperator.class), new SparkJoinOptimizer(pCtx));

    opRules.put(new TypeRule(MapJoinOperator.class), new SparkJoinHintOptimizer(pCtx));

    opRules.put(new RuleRegExp("Disabling Dynamic Partition Pruning By Size",
        SparkPartitionPruningSinkOperator.getOperatorName() + "%"),
        new SparkRemoveDynamicPruningBySize());

    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along
    Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx);
    GraphWalker ogw = new DefaultGraphWalker(disp);

    // Create a list of top op nodes
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pCtx.getTopOps().values());
    ogw.startWalking(topNodes, null);
  }

  /**
   * TODO: need to turn on rules that are commented out and add more if necessary.
   */
  @Override
  protected void generateTaskTree(List<Task<? extends Serializable>> rootTasks, ParseContext pCtx,
      List<Task<MoveWork>> mvTask, Set<ReadEntity> inputs, Set<WriteEntity> outputs)
      throws SemanticException {
    PERF_LOGGER.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_GENERATE_TASK_TREE);

    GenSparkUtils utils = GenSparkUtils.getUtils();
    utils.resetSequenceNumber();

    ParseContext tempParseContext = getParseContext(pCtx, rootTasks);
    GenSparkProcContext procCtx = new GenSparkProcContext(
        conf, tempParseContext, mvTask, rootTasks, inputs, outputs, pCtx.getTopOps());

    // -------------------------------- First Pass ---------------------------------- //
    // Identify SparkPartitionPruningSinkOperators, and break OP tree if necessary

    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    opRules.put(new RuleRegExp("Clone OP tree for PartitionPruningSink",
        SparkPartitionPruningSinkOperator.getOperatorName() + "%"),
        new SplitOpTreeForDPP());

    Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx);
    GraphWalker ogw = new GenSparkWorkWalker(disp, procCtx);

    List<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pCtx.getTopOps().values());
    ogw.startWalking(topNodes, null);

    // -------------------------------- Second Pass ---------------------------------- //
    // Process operator tree in two steps: first we process the extra op trees generated
    // in the first pass. Then we process the main op tree, and the result task will depend
    // on the task generated in the first pass.
    topNodes.clear();
    topNodes.addAll(procCtx.topOps.values());
    generateTaskTreeHelper(procCtx, topNodes);

    // If this set is not empty, it means we need to generate a separate task for collecting
    // the partitions used.
    if (!procCtx.clonedPruningTableScanSet.isEmpty()) {
      SparkTask pruningTask = SparkUtilities.createSparkTask(conf);
      SparkTask mainTask = procCtx.currentTask;
      pruningTask.addDependentTask(procCtx.currentTask);
      procCtx.rootTasks.remove(procCtx.currentTask);
      procCtx.rootTasks.add(pruningTask);
      procCtx.currentTask = pruningTask;

      topNodes.clear();
      topNodes.addAll(procCtx.clonedPruningTableScanSet);
      generateTaskTreeHelper(procCtx, topNodes);

      procCtx.currentTask = mainTask;
    }

    // -------------------------------- Post Pass ---------------------------------- //

    // we need to clone some operator plans and remove union operators still
    for (BaseWork w : procCtx.workWithUnionOperators) {
      GenSparkUtils.getUtils().removeUnionOperators(conf, procCtx, w);
    }

    // we need to fill MapWork with 'local' work and bucket information for SMB Join.
    GenSparkUtils.getUtils().annotateMapWork(procCtx);

    // finally make sure the file sink operators are set up right
    for (FileSinkOperator fileSink : procCtx.fileSinkSet) {
      GenSparkUtils.getUtils().processFileSink(procCtx, fileSink);
    }

    // Process partition pruning sinks
    for (Operator<?> prunerSink : procCtx.pruningSinkSet) {
      utils.processPartitionPruningSink(procCtx, (SparkPartitionPruningSinkOperator) prunerSink);
    }

    PERF_LOGGER.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_GENERATE_TASK_TREE);
  }

  private void generateTaskTreeHelper(GenSparkProcContext procCtx, List<Node> topNodes)
      throws SemanticException {
    // create a walker which walks the tree in a DFS manner while maintaining
    // the operator stack. The dispatcher generates the plan from the operator tree
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();

    GenSparkWork genSparkWork = new GenSparkWork(GenSparkUtils.getUtils());

    opRules.put(new RuleRegExp("Split Work - ReduceSink",
        ReduceSinkOperator.getOperatorName() + "%"), genSparkWork);

    opRules.put(new RuleRegExp("Split Work - SparkPartitionPruningSink",
        SparkPartitionPruningSinkOperator.getOperatorName() + "%"), genSparkWork);

    opRules.put(new TypeRule(MapJoinOperator.class), new SparkReduceSinkMapJoinProc());

    opRules.put(new RuleRegExp("Split Work + Move/Merge - FileSink",
        FileSinkOperator.getOperatorName() + "%"),
        new CompositeProcessor(new SparkFileSinkProcessor(), genSparkWork));

    opRules.put(new RuleRegExp("Handle Analyze Command",
        TableScanOperator.getOperatorName() + "%"),
        new SparkProcessAnalyzeTable(GenSparkUtils.getUtils()));

    opRules.put(new RuleRegExp("Remember union", UnionOperator.getOperatorName() + "%"),
        new NodeProcessor() {
          @Override
          public Object process(Node n, Stack<Node> s,
              NodeProcessorCtx procCtx, Object... os) throws SemanticException {
            GenSparkProcContext context = (GenSparkProcContext) procCtx;
            UnionOperator union = (UnionOperator) n;

            // simply need to remember that we've seen a union.
            context.currentUnionOperators.add(union);
            return null;
          }
        }
    );

    /**
     *  SMB join case:   (Big)   (Small)  (Small)
     *                     TS       TS       TS
     *                      \       |       /
     *                       \      DS     DS
     *                        \     |     /
     *                         SMBJoinOP
     *
     * Some of the other processors are expecting only one traversal beyond SMBJoinOp.
     * We need to traverse from the big-table path only, and stop traversing on the
     * small-table path once we reach SMBJoinOp.
     * Also add some SMB join information to the context, so we can properly annotate
     * the MapWork later on.
     */
    opRules.put(new TypeRule(SMBMapJoinOperator.class),
        new NodeProcessor() {
          @Override
          public Object process(Node currNode, Stack<Node> stack,
              NodeProcessorCtx procCtx, Object... os) throws SemanticException {
            GenSparkProcContext context = (GenSparkProcContext) procCtx;
            SMBMapJoinOperator currSmbNode = (SMBMapJoinOperator) currNode;
            SparkSMBMapJoinInfo smbMapJoinCtx = context.smbMapJoinCtxMap.get(currSmbNode);
            if (smbMapJoinCtx == null) {
              smbMapJoinCtx = new SparkSMBMapJoinInfo();
              context.smbMapJoinCtxMap.put(currSmbNode, smbMapJoinCtx);
            }

            for (Node stackNode : stack) {
              if (stackNode instanceof DummyStoreOperator) {
                // If coming from small-table side, do some book-keeping, and skip traversal.
                smbMapJoinCtx.smallTableRootOps.add(context.currentRootOperator);
                return true;
              }
            }
            // If coming from big-table side, do some book-keeping, and continue traversal
            smbMapJoinCtx.bigTableRootOp = context.currentRootOperator;
            return false;
          }
        }
    );

    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along
    Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx);
    GraphWalker ogw = new GenSparkWorkWalker(disp, procCtx);
    ogw.startWalking(topNodes, null);
  }

  @Override
  protected void setInputFormat(Task<? extends Serializable> task) {
    if (task instanceof SparkTask) {
      SparkWork work = ((SparkTask) task).getWork();
      List<BaseWork> all = work.getAllWork();
      for (BaseWork w : all) {
        if (w instanceof MapWork) {
          MapWork mapWork = (MapWork) w;
          HashMap<String, Operator<? extends OperatorDesc>> opMap = mapWork.getAliasToWork();
          if (!opMap.isEmpty()) {
            for (Operator<? extends OperatorDesc> op : opMap.values()) {
              setInputFormat(mapWork, op);
            }
          }
        }
      }
    } else if (task instanceof ConditionalTask) {
      List<Task<? extends Serializable>> listTasks = ((ConditionalTask) task).getListTasks();
      for (Task<? extends Serializable> tsk : listTasks) {
        setInputFormat(tsk);
      }
    }

    if (task.getChildTasks() != null) {
      for (Task<? extends Serializable> childTask : task.getChildTasks()) {
        setInputFormat(childTask);
      }
    }
  }

  private void setInputFormat(MapWork work, Operator<? extends OperatorDesc> op) {
    if (op.isUseBucketizedHiveInputFormat()) {
      work.setUseBucketizedHiveInputFormat(true);
      return;
    }

    if (op.getChildOperators() != null) {
      for (Operator<? extends OperatorDesc> childOp : op.getChildOperators()) {
        setInputFormat(work, childOp);
      }
    }
  }

  @Override
  protected void decideExecMode(List<Task<? extends Serializable>> rootTasks, Context ctx,
      GlobalLimitCtx globalLimitCtx) throws SemanticException {
    // currently all Spark work is on the cluster
    return;
  }

  @Override
  protected void optimizeTaskPlan(List<Task<? extends Serializable>> rootTasks, ParseContext pCtx,
      Context ctx) throws SemanticException {
    PERF_LOGGER.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_OPTIMIZE_TASK_TREE);
    PhysicalContext physicalCtx = new PhysicalContext(conf, pCtx, pCtx.getContext(), rootTasks,
        pCtx.getFetchTask());

    physicalCtx = new SplitSparkWorkResolver().resolve(physicalCtx);

    if (conf.getBoolVar(HiveConf.ConfVars.HIVESKEWJOIN)) {
      (new SparkSkewJoinResolver()).resolve(physicalCtx);
    } else {
      LOGGER.debug("Skipping runtime skew join optimization");
    }

    physicalCtx = new SparkMapJoinResolver().resolve(physicalCtx);

    if (conf.getBoolVar(HiveConf.ConfVars.HIVENULLSCANOPTIMIZE)) {
      physicalCtx = new NullScanOptimizer().resolve(physicalCtx);
    } else {
      LOGGER.debug("Skipping null scan query optimization");
    }

    if (conf.getBoolVar(HiveConf.ConfVars.HIVEMETADATAONLYQUERIES)) {
      physicalCtx = new MetadataOnlyOptimizer().resolve(physicalCtx);
    } else {
      LOGGER.debug("Skipping metadata only query optimization");
    }

    if (conf.getBoolVar(HiveConf.ConfVars.HIVE_CHECK_CROSS_PRODUCT)) {
      physicalCtx = new SparkCrossProductCheck().resolve(physicalCtx);
    } else {
      LOGGER.debug("Skipping cross product analysis");
    }

    if (conf.getBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED)) {
      (new Vectorizer()).resolve(physicalCtx);
    } else {
      LOGGER.debug("Skipping vectorization");
    }

    if (!"none".equalsIgnoreCase(conf.getVar(HiveConf.ConfVars.HIVESTAGEIDREARRANGE))) {
      (new StageIDsRearranger()).resolve(physicalCtx);
    } else {
      LOGGER.debug("Skipping stage id rearranger");
    }

    new CombineEquivalentWorkResolver().resolve(physicalCtx);

    PERF_LOGGER.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_OPTIMIZE_TASK_TREE);
    return;
  }
}
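Every pass in SparkCompiler follows the same pattern: build a map from Rules (regular expressions or types over operator names) to NodeProcessors, wrap it in a DefaultRuleDispatcher, and hand the dispatcher to a GraphWalker that walks the operator DAG from the top operators. The sketch below shows that pattern in isolation using the same org.apache.hadoop.hive.ql.lib classes the compiler imports. It is a minimal illustration, not part of Hive: the class name RuleWalkDemo and the method countTableScans are hypothetical, and it passes null where SparkCompiler passes a shared processing context.

import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.parse.SemanticException;

public class RuleWalkDemo {

  /**
   * Walks an operator tree and counts TableScanOperators, mirroring how
   * SparkCompiler wires rules to processors in its optimization passes.
   * Hive Operators implement Node, so the top operators of a plan can be
   * passed in directly (e.g. parseContext.getTopOps().values()).
   */
  public static int countTableScans(Collection<? extends Node> topOps) throws SemanticException {
    final AtomicInteger count = new AtomicInteger(0);

    // 1. Map rules (patterns over operator names on the walk stack) to processors.
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    opRules.put(new RuleRegExp("Count TableScans", TableScanOperator.getOperatorName() + "%"),
        new NodeProcessor() {
          @Override
          public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
              Object... nodeOutputs) throws SemanticException {
            // Fired once per node whose name matches the rule.
            count.incrementAndGet();
            return null;
          }
        });

    // 2. The dispatcher fires the processor of the closest matching rule; a real
    //    compiler pass passes a shared context object here instead of null.
    Dispatcher disp = new DefaultRuleDispatcher(null, opRules, null);
    GraphWalker ogw = new DefaultGraphWalker(disp);

    // 3. Start walking from the root (top) operators, exactly as SparkCompiler does.
    List<Node> topNodes = new ArrayList<Node>(topOps);
    ogw.startWalking(topNodes, null);
    return count.get();
  }
}

The design choice to note is the separation of concerns: the walker only knows how to traverse the DAG, the dispatcher only knows how to match rules, and each NodeProcessor carries the pass-specific logic plus any state in its context object. That is why SparkCompiler can reuse the same three-line setup for dynamic partition pruning, join optimization, and task-tree generation while swapping only the rule map and, in some passes, the walker (ForwardWalker or GenSparkWorkWalker instead of DefaultGraphWalker).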