Java tutorial: Apache Tajo's GlobalPlanner (org.apache.tajo.engine.planner.global.GlobalPlanner)
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tajo.engine.planner.global;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tajo.ExecutionBlockId;
import org.apache.tajo.SessionVars;
import org.apache.tajo.algebra.JoinType;
import org.apache.tajo.catalog.*;
import org.apache.tajo.catalog.partition.PartitionMethodDesc;
import org.apache.tajo.catalog.proto.CatalogProtos;
import org.apache.tajo.common.TajoDataTypes;
import org.apache.tajo.conf.TajoConf;
import org.apache.tajo.engine.planner.global.builder.DistinctGroupbyBuilder;
import org.apache.tajo.engine.planner.global.rewriter.GlobalPlanRewriteEngine;
import org.apache.tajo.engine.planner.global.rewriter.GlobalPlanRewriteRuleProvider;
import org.apache.tajo.engine.query.QueryContext;
import org.apache.tajo.exception.NotImplementedException;
import org.apache.tajo.exception.TajoException;
import org.apache.tajo.exception.TajoInternalError;
import org.apache.tajo.exception.UnsupportedException;
import org.apache.tajo.plan.LogicalPlan;
import org.apache.tajo.plan.Target;
import org.apache.tajo.plan.expr.*;
import org.apache.tajo.plan.logical.*;
import org.apache.tajo.plan.rewrite.rules.ProjectionPushDownRule;
import org.apache.tajo.plan.util.PlannerUtil;
import org.apache.tajo.plan.visitor.BasicLogicalPlanVisitor;
import org.apache.tajo.storage.StorageConstants;
import org.apache.tajo.util.KeyValueSet;
import org.apache.tajo.util.ReflectionUtil;
import org.apache.tajo.util.TUtil;
import org.apache.tajo.worker.TajoWorker;

import java.io.IOException;
import java.util.*;

import static org.apache.tajo.conf.TajoConf.ConfVars;
import static org.apache.tajo.conf.TajoConf.ConfVars.GLOBAL_PLAN_REWRITE_RULE_PROVIDER_CLASS;
import static org.apache.tajo.plan.serder.PlanProto.ShuffleType.*;

/**
 * Build DAG
 */
public class GlobalPlanner {
  private static Log LOG = LogFactory.getLog(GlobalPlanner.class);

  private final TajoConf conf;
  private final String dataFormat;
  private final String finalOutputDataFormat;
  private final CatalogService catalog;
  private final GlobalPlanRewriteEngine rewriteEngine;

  @VisibleForTesting
  public GlobalPlanner(final TajoConf conf, final CatalogService catalog) throws IOException {
    this.conf = conf;
    this.catalog = catalog;
    this.dataFormat = conf.getVar(ConfVars.SHUFFLE_FILE_FORMAT).toUpperCase();
    this.finalOutputDataFormat = conf.getVar(ConfVars.QUERY_OUTPUT_DEFAULT_FILE_FORMAT).toUpperCase();

    Class<? extends GlobalPlanRewriteRuleProvider> clazz =
        (Class<? extends GlobalPlanRewriteRuleProvider>) conf.getClassVar(GLOBAL_PLAN_REWRITE_RULE_PROVIDER_CLASS);
    GlobalPlanRewriteRuleProvider provider = ReflectionUtil.newInstance(clazz, conf);
    rewriteEngine = new GlobalPlanRewriteEngine();
    rewriteEngine.addRewriteRule(provider.getRules());
  }

  public GlobalPlanner(final TajoConf conf, final TajoWorker.WorkerContext workerContext) throws IOException {
    this(conf, workerContext.getCatalog());
  }

  public TajoConf getConf() {
    return conf;
  }

  public CatalogService getCatalog() {
    return catalog;
  }

  public String getDataFormat() {
    return dataFormat;
  }

  public static class GlobalPlanContext {
    MasterPlan plan;
    Map<Integer, ExecutionBlock> execBlockMap = Maps.newHashMap();

    public MasterPlan getPlan() {
      return plan;
    }

    public Map<Integer, ExecutionBlock> getExecBlockMap() {
      return execBlockMap;
    }
  }

  /**
   * Builds a master plan from the given logical plan.
   */
  public void build(QueryContext queryContext, MasterPlan masterPlan) throws IOException, TajoException {
    DistributedPlannerVisitor planner = new DistributedPlannerVisitor();
    GlobalPlanContext globalPlanContext = new GlobalPlanContext();
    globalPlanContext.plan = masterPlan;

    LOG.info(masterPlan.getLogicalPlan());

    // copy a logical plan in order to keep the original logical plan. The distributed planner can modify
    // an input logical plan.
    LogicalNode inputPlan = PlannerUtil.clone(masterPlan.getLogicalPlan(),
        masterPlan.getLogicalPlan().getRootBlock().getRoot());

    // create a distributed execution plan by visiting each logical node.
    // Its output is a graph, where each vertex is an execution block, and each edge is a data channel.
    // MasterPlan contains them.
    LogicalNode lastNode = planner.visit(globalPlanContext,
        masterPlan.getLogicalPlan(), masterPlan.getLogicalPlan().getRootBlock(), inputPlan, new Stack<>());
    ExecutionBlock childExecBlock = globalPlanContext.execBlockMap.get(lastNode.getPID());

    ExecutionBlock terminalBlock;
    // TODO - consider two terminal types: specified output or not
    if (childExecBlock.getPlan() != null) {
      terminalBlock = masterPlan.createTerminalBlock();
      DataChannel finalChannel = new DataChannel(childExecBlock.getId(), terminalBlock.getId());
      setFinalOutputChannel(finalChannel, lastNode.getOutSchema());
      masterPlan.addConnect(finalChannel);
    } else { // if one or more unions is terminal
      terminalBlock = childExecBlock;
      for (DataChannel outputChannel : masterPlan.getIncomingChannels(terminalBlock.getId())) {
        setFinalOutputChannel(outputChannel, lastNode.getOutSchema());
      }
    }

    masterPlan.setTerminal(terminalBlock);
    LOG.info("\n\nNon-optimized master plan\n" + masterPlan.toString());
    masterPlan = rewriteEngine.rewrite(queryContext, masterPlan);
    LOG.info("\n\nOptimized master plan\n" + masterPlan.toString());
  }

  private void setFinalOutputChannel(DataChannel outputChannel, Schema outputSchema) {
    outputChannel.setShuffleType(NONE_SHUFFLE);
    outputChannel.setShuffleOutputNum(1);
    outputChannel.setDataFormat(finalOutputDataFormat);
    outputChannel.setSchema(outputSchema);
  }

  public static ScanNode buildInputExecutor(LogicalPlan plan, DataChannel channel) {
    Preconditions.checkArgument(channel.getSchema() != null,
        "Channel schema (" + channel.getSrcId().getId() + " -> " + channel.getTargetId().getId() +
            ") is not initialized");
    TableMeta meta = new TableMeta(channel.getDataFormat(), new KeyValueSet());
    TableDesc desc = new TableDesc(channel.getSrcId().toString(), channel.getSchema(), meta,
        StorageConstants.LOCAL_FS_URI);
    ScanNode scanNode = plan.createNode(ScanNode.class);
    scanNode.init(desc);
    return scanNode;
  }
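  /*
   * Usage sketch (illustrative only; the variable names and the MasterPlan constructor shape below are
   * assumptions, not part of this file). A GlobalPlanner consumes a MasterPlan wrapping a LogicalPlan and
   * fills it with a DAG of ExecutionBlocks connected by DataChannels:
   *
   *   GlobalPlanner planner = new GlobalPlanner(tajoConf, catalogService);
   *   MasterPlan masterPlan = new MasterPlan(queryId, queryContext, logicalPlan);
   *   planner.build(queryContext, masterPlan);
   *   // masterPlan now holds the execution-block DAG, rooted at the terminal block set by build().
   */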
  private DataChannel createDataChannelFromJoin(ExecutionBlock leftBlock, ExecutionBlock rightBlock,
                                                ExecutionBlock parent, JoinNode join, boolean leftTable) {
    ExecutionBlock childBlock = leftTable ? leftBlock : rightBlock;

    DataChannel channel = new DataChannel(childBlock, parent, HASH_SHUFFLE, 32);
    channel.setDataFormat(dataFormat);
    if (join.getJoinType() != JoinType.CROSS) {
      // Shuffle keys must not contain a theta-join condition because Tajo supports only equi-join.
      Column[][] joinColumns = PlannerUtil.joinJoinKeyForEachTable(join.getJoinQual(),
          leftBlock.getPlan().getOutSchema(), rightBlock.getPlan().getOutSchema(), false);
      if (leftTable) {
        channel.setShuffleKeys(joinColumns[0]);
      } else {
        channel.setShuffleKeys(joinColumns[1]);
      }
    }
    return channel;
  }

  private ExecutionBlock buildJoinPlan(GlobalPlanContext context, JoinNode joinNode,
                                       ExecutionBlock leftBlock, ExecutionBlock rightBlock)
      throws TajoException {
    MasterPlan masterPlan = context.plan;
    ExecutionBlock currentBlock;

    LogicalNode leftNode = joinNode.getLeftChild();
    LogicalNode rightNode = joinNode.getRightChild();

    // symmetric repartition join
    boolean leftUnion = leftNode.getType() == NodeType.TABLE_SUBQUERY &&
        ((TableSubQueryNode) leftNode).getSubQuery().getType() == NodeType.UNION;
    boolean rightUnion = rightNode.getType() == NodeType.TABLE_SUBQUERY &&
        ((TableSubQueryNode) rightNode).getSubQuery().getType() == NodeType.UNION;

    if (leftUnion || rightUnion) { // if one of the child execution blocks is a union
      /*
       Joining tableC with the result of the union of tableA and tableB is expected to produce the
       following physical plan, but the Union execution block is not necessary.

       |-eb_0001_000006 (Terminal)
          |-eb_0001_000005 (Join eb_0001_000003, eb_0001_000004)
             |-eb_0001_000004 (Scan TableC)
             |-eb_0001_000003 (Union TableA, TableB)
                |-eb_0001_000002 (Scan TableB)
                |-eb_0001_000001 (Scan TableA)

       The above plan can be changed to the following plan.

       |-eb_0001_000005 (Terminal)
          |-eb_0001_000003 (Join [eb_0001_000001, eb_0001_000002], eb_0001_000004)
             |-eb_0001_000004 (Scan TableC)
             |-eb_0001_000002 (Scan TableB)
             |-eb_0001_000001 (Scan TableA)

       eb_0001_000003's left child should be eb_0001_000001 + eb_0001_000002 and its right child should be
       eb_0001_000004. For this, eb_0001_000001 is the representative of eb_0001_000001 and eb_0001_000002.
       So eb_0001_000003's left child is eb_0001_000001.
      */
      Column[][] joinColumns = null;
      if (joinNode.getJoinType() != JoinType.CROSS) {
        // Shuffle keys must not contain a theta-join condition because Tajo supports only equi-join.
        joinColumns = PlannerUtil.joinJoinKeyForEachTable(joinNode.getJoinQual(),
            leftNode.getOutSchema(), rightNode.getOutSchema(), false);
      }

      if (leftUnion && !rightUnion) { // if only the left side is a union
        currentBlock = leftBlock;
        context.execBlockMap.remove(leftNode.getPID());

        Column[] shuffleKeys = (joinColumns != null) ? joinColumns[0] : null;
        Column[] otherSideShuffleKeys = (joinColumns != null) ? joinColumns[1] : null;
        buildJoinPlanWithUnionChannel(context, joinNode, currentBlock, leftBlock, rightBlock, leftNode,
            shuffleKeys, otherSideShuffleKeys, true);
        currentBlock.setPlan(joinNode);
      } else if (!leftUnion && rightUnion) { // if only the right side is a union
        currentBlock = rightBlock;
        context.execBlockMap.remove(rightNode.getPID());

        Column[] shuffleKeys = (joinColumns != null) ? joinColumns[1] : null;
        Column[] otherSideShuffleKeys = (joinColumns != null) ?
joinColumns[0] : null; buildJoinPlanWithUnionChannel(context, joinNode, currentBlock, rightBlock, leftBlock, rightNode, shuffleKeys, otherSideShuffleKeys, false); currentBlock.setPlan(joinNode); } else { // if both are unions currentBlock = leftBlock; context.execBlockMap.remove(leftNode.getPID()); context.execBlockMap.remove(rightNode.getPID()); buildJoinPlanWithUnionChannel(context, joinNode, currentBlock, leftBlock, null, leftNode, (joinColumns != null ? joinColumns[0] : null), null, true); buildJoinPlanWithUnionChannel(context, joinNode, currentBlock, rightBlock, null, rightNode, (joinColumns != null ? joinColumns[1] : null), null, false); currentBlock.setPlan(joinNode); } return currentBlock; } else { // !leftUnion && !rightUnion currentBlock = masterPlan.newExecutionBlock(); DataChannel leftChannel = createDataChannelFromJoin(leftBlock, rightBlock, currentBlock, joinNode, true); DataChannel rightChannel = createDataChannelFromJoin(leftBlock, rightBlock, currentBlock, joinNode, false); ScanNode leftScan = buildInputExecutor(masterPlan.getLogicalPlan(), leftChannel); ScanNode rightScan = buildInputExecutor(masterPlan.getLogicalPlan(), rightChannel); joinNode.setLeftChild(leftScan); joinNode.setRightChild(rightScan); currentBlock.setPlan(joinNode); masterPlan.addConnect(leftChannel); masterPlan.addConnect(rightChannel); return currentBlock; } } private void buildJoinPlanWithUnionChannel(GlobalPlanContext context, JoinNode joinNode, ExecutionBlock targetBlock, ExecutionBlock sourceBlock, ExecutionBlock otherSideBlock, LogicalNode childNode, Column[] shuffleKeys, Column[] otherSideShuffleKeys, boolean left) { MasterPlan masterPlan = context.getPlan(); String subQueryRelationName = ((TableSubQueryNode) childNode).getCanonicalName(); ExecutionBlockId dedicatedScanNodeBlock = null; for (DataChannel channel : masterPlan.getIncomingChannels(sourceBlock.getId())) { // If all union and right, add channel to left if (otherSideBlock == null && !left) { DataChannel oldChannel = channel; masterPlan.disconnect(oldChannel.getSrcId(), oldChannel.getTargetId()); channel = new DataChannel(oldChannel.getSrcId(), targetBlock.getId()); } channel.setSchema(childNode.getOutSchema()); channel.setShuffleType(HASH_SHUFFLE); channel.setShuffleOutputNum(32); if (shuffleKeys != null) { channel.setShuffleKeys(shuffleKeys); } ScanNode scanNode = buildInputExecutor(masterPlan.getLogicalPlan(), channel); scanNode.getOutSchema().setQualifier(subQueryRelationName); if (dedicatedScanNodeBlock == null) { dedicatedScanNodeBlock = channel.getSrcId(); if (left) { joinNode.setLeftChild(scanNode); } else { joinNode.setRightChild(scanNode); } } masterPlan.addConnect(channel); targetBlock.addUnionScan(channel.getSrcId(), dedicatedScanNodeBlock); } // create other side channel if (otherSideBlock != null) { DataChannel otherSideChannel = new DataChannel(otherSideBlock, targetBlock, HASH_SHUFFLE, 32); otherSideChannel.setDataFormat(dataFormat); if (otherSideShuffleKeys != null) { otherSideChannel.setShuffleKeys(otherSideShuffleKeys); } masterPlan.addConnect(otherSideChannel); ScanNode scan = buildInputExecutor(masterPlan.getLogicalPlan(), otherSideChannel); if (left) { joinNode.setRightChild(scan); } else { joinNode.setLeftChild(scan); } } } private AggregationFunctionCallEval createSumFunction(EvalNode[] args) throws TajoException { FunctionDesc functionDesc = null; functionDesc = getCatalog().getFunction("sum", CatalogProtos.FunctionType.AGGREGATION, TypeConverter.convert(args[0].getValueType()).getDataType()); return new 
      AggregationFunctionCallEval(functionDesc, args);
  }

  private AggregationFunctionCallEval createCountFunction(EvalNode[] args) throws TajoException {
    FunctionDesc functionDesc = getCatalog().getFunction("count", CatalogProtos.FunctionType.AGGREGATION,
        TypeConverter.convert(args[0].getValueType()).getDataType());
    return new AggregationFunctionCallEval(functionDesc, args);
  }

  private AggregationFunctionCallEval createCountRowFunction(EvalNode[] args) throws TajoException {
    FunctionDesc functionDesc = getCatalog().getFunction("count", CatalogProtos.FunctionType.AGGREGATION,
        new TajoDataTypes.DataType[] {});
    return new AggregationFunctionCallEval(functionDesc, args);
  }

  private AggregationFunctionCallEval createMaxFunction(EvalNode[] args) throws TajoException {
    FunctionDesc functionDesc = getCatalog().getFunction("max", CatalogProtos.FunctionType.AGGREGATION,
        TypeConverter.convert(args[0].getValueType()).getDataType());
    return new AggregationFunctionCallEval(functionDesc, args);
  }

  private AggregationFunctionCallEval createMinFunction(EvalNode[] args) throws TajoException {
    FunctionDesc functionDesc = getCatalog().getFunction("min", CatalogProtos.FunctionType.AGGREGATION,
        TypeConverter.convert(args[0].getValueType()).getDataType());
    return new AggregationFunctionCallEval(functionDesc, args);
  }

  /**
   * It contains the transformed functions and their related data.
   * Each non-distinct function is transformed into two functions for both first and second stages.
   */
  private static class RewrittenFunctions {
    AggregationFunctionCallEval[] firstStageEvals;
    List<Target> firstStageTargets;
    AggregationFunctionCallEval secondStageEvals;

    public RewrittenFunctions(int firstStageEvalNum) {
      firstStageEvals = new AggregationFunctionCallEval[firstStageEvalNum];
      firstStageTargets = new ArrayList<>();
    }
  }

  /**
   * Tajo uses three execution blocks for an aggregation operator that includes distinct aggregations.
   * We call this approach <i><b>three-phase aggregation</b></i>.
   *
   * In this case, non-distinct set functions (e.g., <code>count(1), sum(col1)</code>) should be rewritten
   * to other forms. Please see the following example. This is a rewriting case for a query which includes distinct
   * aggregation functions. In this example, <code>count(*)</code> functions are transformed into two
   * functions: count(*) in the inner query and sum() in the outer query.
   *
   * <h2>Original query</h2>
   * <pre>
   * SELECT
   *   grp1, grp2, count(*) as total, count(distinct grp3) as distinct_col
   * from
   *   rel1
   * group by
   *   grp1, grp2;
   * </pre>
   *
   * <h2>Rewritten query</h2>
   * <pre>
   * SELECT grp1, grp2, sum(cnt) as total, count(grp3) as distinct_col from (
   *   SELECT
   *     grp1, grp2, grp3, count(*) as cnt
   *   from
   *     rel1
   *   group by
   *     grp1, grp2, grp3) tmp1
   * group by
   *   grp1, grp2;
   * </pre>
   *
   * The main objective of this method is to transform non-distinct aggregation functions for three-phase aggregation.
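   *
   * As a condensed illustration of the rewrite implemented below (the alias names c, s, and m are invented
   * for this sketch only), each supported non-distinct function becomes a first-stage function plus a
   * second-stage function over the first stage's output column:
   * <pre>
   *   count(*)    ->  first stage: count(*)   as c  |  second stage: sum(c)
   *   count(col)  ->  first stage: count(col) as c  |  second stage: sum(c)
   *   sum(col)    ->  first stage: sum(col)   as s  |  second stage: sum(s)
   *   max(col)    ->  first stage: max(col)   as m  |  second stage: max(m)
   *   min(col)    ->  first stage: min(col)   as m  |  second stage: min(m)
   * </pre>
   * Any other function mixed with distinct aggregation is rejected with an UnsupportedException.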
*/ private RewrittenFunctions rewriteAggFunctionsForDistinctAggregation(GlobalPlanContext context, AggregationFunctionCallEval function) throws TajoException { LogicalPlan plan = context.plan.getLogicalPlan(); RewrittenFunctions rewritten = null; if (function.getName().equalsIgnoreCase("count")) { rewritten = new RewrittenFunctions(1); if (function.getArgs().length == 0) { rewritten.firstStageEvals[0] = createCountRowFunction(function.getArgs()); } else { rewritten.firstStageEvals[0] = createCountFunction(function.getArgs()); } String referenceName = plan.generateUniqueColumnName(rewritten.firstStageEvals[0]); FieldEval fieldEval = new FieldEval(referenceName, rewritten.firstStageEvals[0].getValueType()); rewritten.firstStageTargets.add(0, new Target(fieldEval)); rewritten.secondStageEvals = createSumFunction(new EvalNode[] { fieldEval }); } else if (function.getName().equalsIgnoreCase("sum")) { rewritten = new RewrittenFunctions(1); rewritten.firstStageEvals[0] = createSumFunction(function.getArgs()); String referenceName = plan.generateUniqueColumnName(rewritten.firstStageEvals[0]); FieldEval fieldEval = new FieldEval(referenceName, rewritten.firstStageEvals[0].getValueType()); rewritten.firstStageTargets.add(0, new Target(fieldEval)); rewritten.secondStageEvals = createSumFunction(new EvalNode[] { fieldEval }); } else if (function.getName().equals("max")) { rewritten = new RewrittenFunctions(1); rewritten.firstStageEvals[0] = createMaxFunction(function.getArgs()); String referenceName = plan.generateUniqueColumnName(rewritten.firstStageEvals[0]); FieldEval fieldEval = new FieldEval(referenceName, rewritten.firstStageEvals[0].getValueType()); rewritten.firstStageTargets.add(0, new Target(fieldEval)); rewritten.secondStageEvals = createMaxFunction(new EvalNode[] { fieldEval }); } else if (function.getName().equals("min")) { rewritten = new RewrittenFunctions(1); rewritten.firstStageEvals[0] = createMinFunction(function.getArgs()); String referenceName = plan.generateUniqueColumnName(rewritten.firstStageEvals[0]); FieldEval fieldEval = new FieldEval(referenceName, rewritten.firstStageEvals[0].getValueType()); rewritten.firstStageTargets.add(0, new Target(fieldEval)); rewritten.secondStageEvals = createMinFunction(new EvalNode[] { fieldEval }); } else { throw new UnsupportedException("a mix of other functions"); } return rewritten; } /** * If there are at least one distinct aggregation function, a query works as if the query is rewritten as follows: * * <h2>Original query</h2> * <pre> * SELECT * grp1, grp2, count(*) as total, count(distinct grp3) as distinct_col * from * rel1 * group by * grp1, grp2; * </pre> * * The query will work as if the query is rewritten into two queries as follows: * * <h2>Rewritten query</h2> * <pre> * SELECT grp1, grp2, sum(cnt) as total, count(grp3) as distinct_col from ( * SELECT * grp1, grp2, grp3, count(*) as cnt * from * rel1 * group by * grp1, grp2, grp3) tmp1 * group by * grp1, grp2 * ) table1; * </pre> * * In more detail, the first aggregation aggregates not only original grouping fields but also distinct columns. * Non-distinct aggregation functions should be transformed to proper functions. * Then, the second aggregation aggregates only original grouping fields with distinct aggregation functions and * transformed non-distinct aggregation functions. * * As a result, although a no-distinct aggregation requires two stages, a distinct aggregation requires three * execution blocks. 
*/ private ExecutionBlock buildGroupByIncludingDistinctFunctionsMultiStage(GlobalPlanContext context, ExecutionBlock latestExecBlock, GroupbyNode groupbyNode) throws TajoException { Column[] originalGroupingColumns = groupbyNode.getGroupingColumns(); LinkedHashSet<Column> firstStageGroupingColumns = Sets .newLinkedHashSet(Arrays.asList(groupbyNode.getGroupingColumns())); List<AggregationFunctionCallEval> firstStageAggFunctions = Lists.newArrayList(); List<AggregationFunctionCallEval> secondPhaseEvalNodes = Lists.newArrayList(); List<Target> firstPhaseEvalNodeTargets = Lists.newArrayList(); for (AggregationFunctionCallEval aggFunction : groupbyNode.getAggFunctions()) { if (aggFunction.isDistinct()) { // add distinct columns to first stage's grouping columns firstStageGroupingColumns.addAll(EvalTreeUtil.findUniqueColumns(aggFunction)); // keep distinct aggregation functions for the second stage secondPhaseEvalNodes.add(aggFunction); } else { // Rewrite non-distinct aggregation functions RewrittenFunctions rewritten = rewriteAggFunctionsForDistinctAggregation(context, aggFunction); firstStageAggFunctions.addAll(Lists.newArrayList(rewritten.firstStageEvals)); firstPhaseEvalNodeTargets.addAll(Lists.newArrayList(rewritten.firstStageTargets)); // keep rewritten non-aggregation functions for the second stage secondPhaseEvalNodes.add(rewritten.secondStageEvals); } } List<Target> firstStageTargets = new ArrayList<>(); for (Column column : firstStageGroupingColumns) { firstStageTargets.add(new Target(new FieldEval(column))); } for (Target target : firstPhaseEvalNodeTargets) { firstStageTargets.add(target); } // Create the groupby node for the first stage and set all necessary descriptions GroupbyNode firstStageGroupby = new GroupbyNode(context.plan.getLogicalPlan().newPID()); firstStageGroupby.setGroupingColumns(TUtil.toArray(firstStageGroupingColumns, Column.class)); firstStageGroupby.setAggFunctions(firstStageAggFunctions); firstStageGroupby.setTargets(firstStageTargets); firstStageGroupby.setChild(groupbyNode.getChild()); firstStageGroupby.setInSchema(groupbyNode.getInSchema()); // Makes two execution blocks for the first stage ExecutionBlock firstStage = buildGroupBy(context, latestExecBlock, firstStageGroupby); // Create the groupby node for the second stage. 
GroupbyNode secondPhaseGroupby = new GroupbyNode(context.plan.getLogicalPlan().newPID()); secondPhaseGroupby.setGroupingColumns(originalGroupingColumns); secondPhaseGroupby.setAggFunctions(secondPhaseEvalNodes); secondPhaseGroupby.setTargets(groupbyNode.getTargets()); ExecutionBlock secondStage = context.plan.newExecutionBlock(); secondStage.setPlan(secondPhaseGroupby); SortSpec[] sortSpecs = PlannerUtil.columnsToSortSpecs(firstStageGroupingColumns); secondStage.getEnforcer().enforceSortAggregation(secondPhaseGroupby.getPID(), sortSpecs); // Create a data channel between the first and second stages DataChannel channel; channel = new DataChannel(firstStage, secondStage, HASH_SHUFFLE, 32); channel.setShuffleKeys(secondPhaseGroupby.getGroupingColumns().clone()); channel.setSchema(firstStage.getPlan().getOutSchema()); channel.setDataFormat(dataFormat); // Setting for the second phase's logical plan ScanNode scanNode = buildInputExecutor(context.plan.getLogicalPlan(), channel); secondPhaseGroupby.setChild(scanNode); secondPhaseGroupby.setInSchema(scanNode.getOutSchema()); secondStage.setPlan(secondPhaseGroupby); context.plan.addConnect(channel); return secondStage; } private ExecutionBlock buildGroupBy(GlobalPlanContext context, ExecutionBlock lastBlock, GroupbyNode groupbyNode) throws TajoException { MasterPlan masterPlan = context.plan; ExecutionBlock currentBlock; if (groupbyNode.isDistinct()) { // if there is at one distinct aggregation function boolean multiLevelEnabled = context.getPlan().getContext() .getBool(SessionVars.GROUPBY_MULTI_LEVEL_ENABLED); if (multiLevelEnabled) { if (PlannerUtil.findTopNode(groupbyNode, NodeType.UNION) == null) { DistinctGroupbyBuilder builder = new DistinctGroupbyBuilder(this); return builder.buildMultiLevelPlan(context, lastBlock, groupbyNode); } else { DistinctGroupbyBuilder builder = new DistinctGroupbyBuilder(this); return builder.buildPlan(context, lastBlock, groupbyNode); } } else { DistinctGroupbyBuilder builder = new DistinctGroupbyBuilder(this); return builder.buildPlan(context, lastBlock, groupbyNode); } } else { GroupbyNode firstPhaseGroupby = createFirstPhaseGroupBy(masterPlan.getLogicalPlan(), groupbyNode); if (hasUnionChild(firstPhaseGroupby)) { currentBlock = buildGroupbyAndUnionPlan(masterPlan, lastBlock, firstPhaseGroupby, groupbyNode); } else { // general hash-shuffled aggregation currentBlock = buildTwoPhaseGroupby(masterPlan, lastBlock, firstPhaseGroupby, groupbyNode); } } return currentBlock; } public static boolean hasUnionChild(UnaryNode node) { // there are three cases: // // The first case is: // // create table [tbname] as select * from ( select ... UNION select ...) T // // We can generalize this case as 'a store operator on the top of union'. // In this case, a store operator determines a shuffle method. // // The second case is: // // select avg(..) from (select ... UNION select ) T // // We can generalize this case as 'a shuffle required operator on the top of union'. // // The third case is: // // create table select * from ( select ... ) a union all select * from ( select ... 
) b LogicalNode childNode = node.getChild(); if (childNode instanceof UnaryNode) { // first case UnaryNode child = (UnaryNode) childNode; if (child.getChild().getType() == NodeType.PROJECTION) { child = child.getChild(); } if (child.getChild().getType() == NodeType.TABLE_SUBQUERY) { TableSubQueryNode tableSubQuery = child.getChild(); return tableSubQuery.getSubQuery().getType() == NodeType.UNION; } } else if (childNode.getType() == NodeType.TABLE_SUBQUERY) { // second case TableSubQueryNode tableSubQuery = node.getChild(); return tableSubQuery.getSubQuery().getType() == NodeType.UNION; } else if (childNode.getType() == NodeType.UNION) { // third case return true; } return false; } private ExecutionBlock buildGroupbyAndUnionPlan(MasterPlan masterPlan, ExecutionBlock lastBlock, GroupbyNode firstPhaseGroupBy, GroupbyNode secondPhaseGroupBy) throws TajoException { DataChannel lastDataChannel = null; // It pushes down the first phase group-by operator into all child blocks. // // (second phase) G (currentBlock) // /|\ // / / | \ // (first phase) G G G G (child block) // They are already connected one another. // So, we don't need to connect them again. for (DataChannel dataChannel : masterPlan.getIncomingChannels(lastBlock.getId())) { if (firstPhaseGroupBy.isEmptyGrouping()) { dataChannel.setShuffle(HASH_SHUFFLE, firstPhaseGroupBy.getGroupingColumns(), 1); } else { dataChannel.setShuffle(HASH_SHUFFLE, firstPhaseGroupBy.getGroupingColumns(), 32); } dataChannel.setSchema(firstPhaseGroupBy.getOutSchema()); ExecutionBlock childBlock = masterPlan.getExecBlock(dataChannel.getSrcId()); // Why must firstPhaseGroupby be copied? // // A groupby in each execution block can have different child. // It affects groupby's input schema. GroupbyNode firstPhaseGroupbyCopy = PlannerUtil.clone(masterPlan.getLogicalPlan(), firstPhaseGroupBy); firstPhaseGroupbyCopy.setChild(childBlock.getPlan()); childBlock.setPlan(firstPhaseGroupbyCopy); // just keep the last data channel. 
lastDataChannel = dataChannel; } ScanNode scanNode = buildInputExecutor(masterPlan.getLogicalPlan(), lastDataChannel); secondPhaseGroupBy.setChild(scanNode); lastBlock.setPlan(secondPhaseGroupBy); return lastBlock; } private ExecutionBlock buildTwoPhaseGroupby(MasterPlan masterPlan, ExecutionBlock latestBlock, GroupbyNode firstPhaseGroupby, GroupbyNode secondPhaseGroupby) throws TajoException { ExecutionBlock childBlock = latestBlock; childBlock.setPlan(firstPhaseGroupby); ExecutionBlock currentBlock = masterPlan.newExecutionBlock(); DataChannel channel; if (firstPhaseGroupby.isEmptyGrouping()) { channel = new DataChannel(childBlock, currentBlock, HASH_SHUFFLE, 1); channel.setShuffleKeys(firstPhaseGroupby.getGroupingColumns()); } else { channel = new DataChannel(childBlock, currentBlock, HASH_SHUFFLE, 32); channel.setShuffleKeys(firstPhaseGroupby.getGroupingColumns()); } channel.setSchema(firstPhaseGroupby.getOutSchema()); channel.setDataFormat(dataFormat); ScanNode scanNode = buildInputExecutor(masterPlan.getLogicalPlan(), channel); secondPhaseGroupby.setChild(scanNode); secondPhaseGroupby.setInSchema(scanNode.getOutSchema()); currentBlock.setPlan(secondPhaseGroupby); masterPlan.addConnect(channel); return currentBlock; } public static GroupbyNode createFirstPhaseGroupBy(LogicalPlan plan, GroupbyNode groupBy) { Preconditions.checkNotNull(groupBy); GroupbyNode firstPhaseGroupBy = PlannerUtil.clone(plan, groupBy); GroupbyNode secondPhaseGroupBy = groupBy; // Set first phase expressions if (secondPhaseGroupBy.hasAggFunctions()) { int evalNum = secondPhaseGroupBy.getAggFunctions().size(); List<AggregationFunctionCallEval> secondPhaseEvals = secondPhaseGroupBy.getAggFunctions(); List<AggregationFunctionCallEval> firstPhaseEvals = new ArrayList<>(); String[] firstPhaseEvalNames = new String[evalNum]; for (int i = 0; i < evalNum; i++) { try { firstPhaseEvals.add((AggregationFunctionCallEval) secondPhaseEvals.get(i).clone()); } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } firstPhaseEvals.get(i).setFirstPhase(); firstPhaseEvalNames[i] = plan.generateUniqueColumnName(firstPhaseEvals.get(i)); FieldEval param = new FieldEval(firstPhaseEvalNames[i], firstPhaseEvals.get(i).getValueType()); secondPhaseEvals.get(i).setLastPhase(); secondPhaseEvals.get(i).setArgs(new EvalNode[] { param }); } secondPhaseGroupBy.setAggFunctions(secondPhaseEvals); firstPhaseGroupBy.setAggFunctions(firstPhaseEvals); List<Target> firstPhaseTargets = ProjectionPushDownRule.buildGroupByTarget(firstPhaseGroupBy, null, firstPhaseEvalNames); firstPhaseGroupBy.setTargets(firstPhaseTargets); secondPhaseGroupBy.setInSchema(PlannerUtil.targetToSchema(firstPhaseTargets)); } return firstPhaseGroupBy; } private ExecutionBlock buildSortPlan(GlobalPlanContext context, ExecutionBlock childBlock, SortNode currentNode) throws TajoException { MasterPlan masterPlan = context.plan; ExecutionBlock currentBlock; SortNode firstSortNode = PlannerUtil.clone(context.plan.getLogicalPlan(), currentNode); if (firstSortNode.getChild().getType() == NodeType.TABLE_SUBQUERY && ((TableSubQueryNode) firstSortNode.getChild()).getSubQuery().getType() == NodeType.UNION) { currentBlock = childBlock; for (DataChannel channel : masterPlan.getIncomingChannels(childBlock.getId())) { channel.setShuffle(RANGE_SHUFFLE, PlannerUtil.sortSpecsToSchema(currentNode.getSortKeys()).toArray(), 32); channel.setSchema(firstSortNode.getOutSchema()); ExecutionBlock subBlock = masterPlan.getExecBlock(channel.getSrcId()); SortNode s1 = 
PlannerUtil.clone(context.plan.getLogicalPlan(), firstSortNode); s1.setChild(subBlock.getPlan()); subBlock.setPlan(s1); ScanNode secondScan = buildInputExecutor(masterPlan.getLogicalPlan(), channel); currentNode.setChild(secondScan); currentNode.setInSchema(secondScan.getOutSchema()); currentBlock.setPlan(currentNode); currentBlock.getEnforcer().addSortedInput(secondScan.getTableName(), currentNode.getSortKeys()); } } else { LogicalNode childBlockPlan = childBlock.getPlan(); firstSortNode.setChild(childBlockPlan); // sort is a non-projectable operator. So, in/out schemas are the same to its child operator. firstSortNode.setInSchema(childBlockPlan.getOutSchema()); firstSortNode.setOutSchema(childBlockPlan.getOutSchema()); childBlock.setPlan(firstSortNode); currentBlock = masterPlan.newExecutionBlock(); DataChannel channel = new DataChannel(childBlock, currentBlock, RANGE_SHUFFLE, 32); channel.setShuffleKeys(PlannerUtil.sortSpecsToSchema(currentNode.getSortKeys()).toArray()); channel.setSchema(firstSortNode.getOutSchema()); ScanNode secondScan = buildInputExecutor(masterPlan.getLogicalPlan(), channel); currentNode.setChild(secondScan); currentNode.setInSchema(secondScan.getOutSchema()); currentBlock.setPlan(currentNode); currentBlock.getEnforcer().addSortedInput(secondScan.getTableName(), currentNode.getSortKeys()); masterPlan.addConnect(channel); } return currentBlock; } /** * It builds a distributed execution block for CTAS, InsertNode, and StoreTableNode. */ private ExecutionBlock buildStorePlan(GlobalPlanContext context, ExecutionBlock lastBlock, StoreTableNode currentNode) throws TajoException { if (currentNode.hasPartition()) { // if a target table is a partitioned table // Verify supported partition types PartitionMethodDesc partitionMethod = currentNode.getPartitionMethod(); if (partitionMethod.getPartitionType() != CatalogProtos.PartitionType.COLUMN) { throw new NotImplementedException( "partition type '" + partitionMethod.getPartitionType().name() + "'"); } if (hasUnionChild(currentNode)) { // if it has union children return buildShuffleAndStorePlanToPartitionedTableWithUnion(context, currentNode, lastBlock); } else { // otherwise return buildShuffleAndStorePlanToPartitionedTable(context, currentNode, lastBlock); } } else { // if result table is not a partitioned table, directly store it return buildNoPartitionedStorePlan(context, currentNode, lastBlock); } } /** * It makes a plan to store directly union plans into a non-partitioned table. */ private ExecutionBlock buildShuffleAndStorePlanNoPartitionedTableWithUnion(GlobalPlanContext context, StoreTableNode currentNode, ExecutionBlock childBlock) throws TajoException { for (ExecutionBlock grandChildBlock : context.plan.getChilds(childBlock)) { StoreTableNode copy = PlannerUtil.clone(context.plan.getLogicalPlan(), currentNode); copy.setChild(grandChildBlock.getPlan()); grandChildBlock.setPlan(copy); } return childBlock; } /** * It inserts shuffle and adds store plan on a partitioned table, * and it push downs those plans into child unions. 
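   *
   * A rough shape of the result for an INSERT into a table partitioned by a column p, where the child of the
   * store node is a union of two blocks (the block ids are illustrative only; the shuffle settings mirror
   * setShuffleKeysFromPartitionedTableStore below):
   * <pre>
   *   |-eb_2 (StoreTableNode reading a generated ScanNode over the rewritten channels)
   *      |-eb_1 (union child)  -- SCATTERED_HASH_SHUFFLE keyed on p, 32 outputs --> eb_2
   *      |-eb_0 (union child)  -- SCATTERED_HASH_SHUFFLE keyed on p, 32 outputs --> eb_2
   * </pre>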
*/ private ExecutionBlock buildShuffleAndStorePlanToPartitionedTableWithUnion(GlobalPlanContext context, StoreTableNode currentNode, ExecutionBlock lastBlock) throws TajoException { MasterPlan masterPlan = context.plan; DataChannel lastChannel = null; for (DataChannel channel : masterPlan.getIncomingChannels(lastBlock.getId())) { ExecutionBlock childBlock = masterPlan.getExecBlock(channel.getSrcId()); setShuffleKeysFromPartitionedTableStore(currentNode, channel); channel.setSchema(childBlock.getPlan().getOutSchema()); channel.setDataFormat(dataFormat); lastChannel = channel; } ScanNode scanNode = buildInputExecutor(masterPlan.getLogicalPlan(), lastChannel); currentNode.setChild(scanNode); currentNode.setInSchema(scanNode.getOutSchema()); lastBlock.setPlan(currentNode); return lastBlock; } /** * It inserts shuffle and adds store plan on a partitioned table. */ private ExecutionBlock buildShuffleAndStorePlanToPartitionedTable(GlobalPlanContext context, StoreTableNode currentNode, ExecutionBlock lastBlock) throws TajoException { MasterPlan masterPlan = context.plan; ExecutionBlock nextBlock = masterPlan.newExecutionBlock(); DataChannel channel = new DataChannel(lastBlock, nextBlock, HASH_SHUFFLE, 32); setShuffleKeysFromPartitionedTableStore(currentNode, channel); channel.setSchema(lastBlock.getPlan().getOutSchema()); channel.setDataFormat(dataFormat); ScanNode scanNode = buildInputExecutor(masterPlan.getLogicalPlan(), channel); currentNode.setChild(scanNode); currentNode.setInSchema(scanNode.getOutSchema()); nextBlock.setPlan(currentNode); masterPlan.addConnect(channel); return nextBlock; } private ExecutionBlock buildNoPartitionedStorePlan(GlobalPlanContext context, StoreTableNode currentNode, ExecutionBlock childBlock) throws TajoException { if (hasUnionChild(currentNode)) { // when the below is union return buildShuffleAndStorePlanNoPartitionedTableWithUnion(context, currentNode, childBlock); } else { currentNode.setChild(childBlock.getPlan()); currentNode.setInSchema(childBlock.getPlan().getOutSchema()); childBlock.setPlan(currentNode); return childBlock; } } private void setShuffleKeysFromPartitionedTableStore(StoreTableNode node, DataChannel channel) { Preconditions.checkState(node.hasTargetTable(), "A target table must be a partitioned table."); PartitionMethodDesc partitionMethod = node.getPartitionMethod(); if (node.getType() == NodeType.INSERT || node.getType() == NodeType.CREATE_TABLE) { Schema tableSchema = null, projectedSchema = null; if (node.getType() == NodeType.INSERT) { tableSchema = ((InsertNode) node).getTableSchema(); projectedSchema = ((InsertNode) node).getProjectedSchema(); } else { tableSchema = node.getOutSchema(); projectedSchema = node.getInSchema(); } channel.setSchema(projectedSchema); Column[] shuffleKeys = new Column[partitionMethod.getExpressionSchema().size()]; int i = 0, id = 0; for (Column column : partitionMethod.getExpressionSchema().getRootColumns()) { if (node.getType() == NodeType.INSERT) { id = tableSchema.getColumnId(column.getQualifiedName()); } else { id = tableSchema.getRootColumns().size() + i; } shuffleKeys[i++] = projectedSchema.getColumn(id); } channel.setShuffleKeys(shuffleKeys); channel.setShuffleType(SCATTERED_HASH_SHUFFLE); } else { channel.setShuffleKeys(partitionMethod.getExpressionSchema().toArray()); channel.setShuffleType(HASH_SHUFFLE); } channel.setShuffleOutputNum(32); } public class DistributedPlannerVisitor extends BasicLogicalPlanVisitor<GlobalPlanContext, LogicalNode> { @Override public LogicalNode visitRoot(GlobalPlanContext 
context, LogicalPlan plan, LogicalPlan.QueryBlock block, LogicalRootNode node, Stack<LogicalNode> stack) throws TajoException { return super.visitRoot(context, plan, block, node, stack); } @Override public LogicalNode visitProjection(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock block, ProjectionNode node, Stack<LogicalNode> stack) throws TajoException { LogicalNode child = super.visitProjection(context, plan, block, node, stack); ExecutionBlock execBlock = context.execBlockMap.remove(child.getPID()); if (child.getType() == NodeType.TABLE_SUBQUERY && ((TableSubQueryNode) child).getSubQuery().getType() == NodeType.UNION) { MasterPlan masterPlan = context.plan; for (DataChannel dataChannel : masterPlan.getIncomingChannels(execBlock.getId())) { dataChannel.setDataFormat(finalOutputDataFormat); ExecutionBlock subBlock = masterPlan.getExecBlock(dataChannel.getSrcId()); ProjectionNode copy = PlannerUtil.clone(plan, node); copy.setChild(subBlock.getPlan()); subBlock.setPlan(copy); } execBlock.setPlan(null); } else { node.setChild(execBlock.getPlan()); node.setInSchema(execBlock.getPlan().getOutSchema()); execBlock.setPlan(node); } context.execBlockMap.put(node.getPID(), execBlock); return node; } @Override public LogicalNode visitLimit(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock block, LimitNode node, Stack<LogicalNode> stack) throws TajoException { LogicalNode child = super.visitLimit(context, plan, block, node, stack); ExecutionBlock execBlock; execBlock = context.execBlockMap.remove(child.getPID()); if (child.getType() == NodeType.SORT) { node.setChild(execBlock.getPlan()); execBlock.setPlan(node); ExecutionBlock childBlock = context.plan.getChild(execBlock, 0); LimitNode childLimit = PlannerUtil.clone(context.plan.getLogicalPlan(), node); childLimit.setChild(childBlock.getPlan()); childBlock.setPlan(childLimit); DataChannel channel = context.plan.getChannel(childBlock, execBlock); channel.setShuffleOutputNum(1); context.execBlockMap.put(node.getPID(), execBlock); } else { node.setChild(execBlock.getPlan()); execBlock.setPlan(node); ExecutionBlock newExecBlock = context.plan.newExecutionBlock(); DataChannel newChannel = new DataChannel(execBlock, newExecBlock, HASH_SHUFFLE, 1); newChannel.setShuffleKeys(new Column[] {}); newChannel.setSchema(node.getOutSchema()); newChannel.setDataFormat(dataFormat); ScanNode scanNode = buildInputExecutor(plan, newChannel); LimitNode parentLimit = PlannerUtil.clone(context.plan.getLogicalPlan(), node); parentLimit.setChild(scanNode); newExecBlock.setPlan(parentLimit); context.plan.addConnect(newChannel); context.execBlockMap.put(parentLimit.getPID(), newExecBlock); node = parentLimit; } return node; } @Override public LogicalNode visitSort(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock block, SortNode node, Stack<LogicalNode> stack) throws TajoException { LogicalNode child = super.visitSort(context, plan, block, node, stack); ExecutionBlock childBlock = context.execBlockMap.remove(child.getPID()); ExecutionBlock newExecBlock = buildSortPlan(context, childBlock, node); context.execBlockMap.put(node.getPID(), newExecBlock); return node; } @Override public LogicalNode visitHaving(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock block, HavingNode node, Stack<LogicalNode> stack) throws TajoException { LogicalNode child = super.visitHaving(context, plan, block, node, stack); // Don't separate execution block. Having is pushed to the second grouping execution block. 
ExecutionBlock childBlock = context.execBlockMap.remove(child.getPID()); node.setChild(childBlock.getPlan()); childBlock.setPlan(node); context.execBlockMap.put(node.getPID(), childBlock); return node; } @Override public LogicalNode visitWindowAgg(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock block, WindowAggNode node, Stack<LogicalNode> stack) throws TajoException { LogicalNode child = super.visitWindowAgg(context, plan, block, node, stack); ExecutionBlock childBlock = context.execBlockMap.remove(child.getPID()); ExecutionBlock newExecBlock = buildWindowAgg(context, childBlock, node); context.execBlockMap.put(newExecBlock.getPlan().getPID(), newExecBlock); return newExecBlock.getPlan(); } private ExecutionBlock buildWindowAgg(GlobalPlanContext context, ExecutionBlock lastBlock, WindowAggNode windowAgg) throws TajoException { MasterPlan masterPlan = context.plan; ExecutionBlock childBlock = lastBlock; ExecutionBlock currentBlock = masterPlan.newExecutionBlock(); DataChannel channel; if (windowAgg.hasPartitionKeys()) { // if there is at one distinct aggregation function channel = new DataChannel(childBlock, currentBlock, RANGE_SHUFFLE, 32); channel.setShuffleKeys(windowAgg.getPartitionKeys()); } else { channel = new DataChannel(childBlock, currentBlock, HASH_SHUFFLE, 1); channel.setShuffleKeys(null); } channel.setSchema(windowAgg.getInSchema()); channel.setDataFormat(dataFormat); LogicalNode childNode = windowAgg.getChild(); ScanNode scanNode = buildInputExecutor(masterPlan.getLogicalPlan(), channel); if (windowAgg.hasPartitionKeys()) { SortNode sortNode = masterPlan.getLogicalPlan().createNode(SortNode.class); sortNode.setOutSchema(scanNode.getOutSchema()); sortNode.setInSchema(scanNode.getOutSchema()); sortNode.setSortSpecs(PlannerUtil.columnsToSortSpecs(windowAgg.getPartitionKeys())); sortNode.setChild(childNode); childBlock.setPlan(sortNode); windowAgg.setChild(scanNode); } else { windowAgg.setInSchema(scanNode.getOutSchema()); windowAgg.setChild(scanNode); childBlock.setPlan(childNode); } currentBlock.setPlan(windowAgg); context.plan.addConnect(channel); return currentBlock; } @Override public LogicalNode visitGroupBy(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock block, GroupbyNode node, Stack<LogicalNode> stack) throws TajoException { LogicalNode child = super.visitGroupBy(context, plan, block, node, stack); ExecutionBlock childBlock = context.execBlockMap.remove(child.getPID()); ExecutionBlock newExecBlock = buildGroupBy(context, childBlock, node); context.execBlockMap.put(newExecBlock.getPlan().getPID(), newExecBlock); return newExecBlock.getPlan(); } @Override public LogicalNode visitFilter(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock block, SelectionNode node, Stack<LogicalNode> stack) throws TajoException { LogicalNode child = super.visitFilter(context, plan, block, node, stack); ExecutionBlock execBlock = context.execBlockMap.remove(child.getPID()); node.setChild(execBlock.getPlan()); node.setInSchema(execBlock.getPlan().getOutSchema()); execBlock.setPlan(node); context.execBlockMap.put(node.getPID(), execBlock); return node; } @Override public LogicalNode visitJoin(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock block, JoinNode node, Stack<LogicalNode> stack) throws TajoException { LogicalNode leftChild = visit(context, plan, block, node.getLeftChild(), stack); ExecutionBlock leftChildBlock = context.execBlockMap.get(leftChild.getPID()); LogicalNode rightChild = visit(context, plan, block, 
node.getRightChild(), stack); ExecutionBlock rightChildBlock = context.execBlockMap.get(rightChild.getPID()); if (node.getJoinType() == JoinType.LEFT_OUTER) { leftChildBlock.setPreservedRow(); rightChildBlock.setNullSuppllying(); } else if (node.getJoinType() == JoinType.RIGHT_OUTER) { leftChildBlock.setNullSuppllying(); rightChildBlock.setPreservedRow(); } else if (node.getJoinType() == JoinType.FULL_OUTER) { leftChildBlock.setPreservedRow(); leftChildBlock.setNullSuppllying(); rightChildBlock.setPreservedRow(); rightChildBlock.setNullSuppllying(); } ExecutionBlock newExecBlock = buildJoinPlan(context, node, leftChildBlock, rightChildBlock); context.execBlockMap.put(node.getPID(), newExecBlock); return node; } @Override public LogicalNode visitUnion(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock queryBlock, UnionNode node, Stack<LogicalNode> stack) throws TajoException { stack.push(node); LogicalPlan.QueryBlock leftQueryBlock = plan.getBlock(node.getLeftChild()); LogicalNode leftChild = visit(context, plan, leftQueryBlock, leftQueryBlock.getRoot(), stack); LogicalPlan.QueryBlock rightQueryBlock = plan.getBlock(node.getRightChild()); LogicalNode rightChild = visit(context, plan, rightQueryBlock, rightQueryBlock.getRoot(), stack); stack.pop(); MasterPlan masterPlan = context.getPlan(); List<ExecutionBlock> unionBlocks = Lists.newArrayList(); List<ExecutionBlock> queryBlockBlocks = Lists.newArrayList(); ExecutionBlock leftBlock = context.execBlockMap.remove(leftChild.getPID()); ExecutionBlock rightBlock = context.execBlockMap.remove(rightChild.getPID()); // These union types need to eliminate unnecessary nodes between parent and child node of query tree. boolean leftUnion = (leftChild.getType() == NodeType.UNION) || ((leftChild.getType() == NodeType.TABLE_SUBQUERY) && (((TableSubQueryNode) leftChild).getSubQuery().getType() == NodeType.UNION)); boolean rightUnion = (rightChild.getType() == NodeType.UNION) || (rightChild.getType() == NodeType.TABLE_SUBQUERY) && (((TableSubQueryNode) rightChild).getSubQuery().getType() == NodeType.UNION); if (leftUnion) { unionBlocks.add(leftBlock); } else { queryBlockBlocks.add(leftBlock); } if (rightUnion) { unionBlocks.add(rightBlock); } else { queryBlockBlocks.add(rightBlock); } ExecutionBlock execBlock; if (unionBlocks.size() == 0) { execBlock = context.plan.newExecutionBlock(); } else { execBlock = unionBlocks.get(0); } for (ExecutionBlock childBlocks : unionBlocks) { for (ExecutionBlock grandChildBlock : masterPlan.getChilds(childBlocks)) { masterPlan.disconnect(grandChildBlock, childBlocks); queryBlockBlocks.add(grandChildBlock); } } for (ExecutionBlock childBlocks : queryBlockBlocks) { DataChannel channel = new DataChannel(childBlocks, execBlock, NONE_SHUFFLE, 1); channel.setDataFormat(dataFormat); masterPlan.addConnect(channel); } context.execBlockMap.put(node.getPID(), execBlock); return node; } private LogicalNode handleUnaryNode(GlobalPlanContext context, LogicalNode child, LogicalNode node) throws TajoException { ExecutionBlock execBlock = context.execBlockMap.remove(child.getPID()); execBlock.setPlan(node); context.execBlockMap.put(node.getPID(), execBlock); return node; } @Override public LogicalNode visitExcept(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock queryBlock, ExceptNode node, Stack<LogicalNode> stack) throws TajoException { LogicalNode child = super.visitExcept(context, plan, queryBlock, node, stack); return handleUnaryNode(context, child, node); } @Override public LogicalNode 
visitIntersect(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock queryBlock, IntersectNode node, Stack<LogicalNode> stack) throws TajoException { LogicalNode child = super.visitIntersect(context, plan, queryBlock, node, stack); return handleUnaryNode(context, child, node); } @Override public LogicalNode visitTableSubQuery(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock queryBlock, TableSubQueryNode node, Stack<LogicalNode> stack) throws TajoException { LogicalNode child = super.visitTableSubQuery(context, plan, queryBlock, node, stack); node.setSubQuery(child); ExecutionBlock currentBlock = context.execBlockMap.remove(child.getPID()); if (child.getType() == NodeType.UNION) { List<TableSubQueryNode> addedTableSubQueries = new ArrayList<>(); TableSubQueryNode leftMostSubQueryNode = null; for (ExecutionBlock childBlock : context.plan.getChilds(currentBlock.getId())) { TableSubQueryNode copy = PlannerUtil.clone(plan, node); copy.setSubQuery(childBlock.getPlan()); childBlock.setPlan(copy); addedTableSubQueries.add(copy); //Find a SubQueryNode which contains all columns in InputSchema matched with Target and OutputSchema's column if (copy.getInSchema().containsAll(copy.getOutSchema().getRootColumns())) { for (Target eachTarget : copy.getTargets()) { Set<Column> columns = EvalTreeUtil.findUniqueColumns(eachTarget.getEvalTree()); if (copy.getInSchema().containsAll(columns)) { leftMostSubQueryNode = copy; break; } } } } if (leftMostSubQueryNode != null) { // replace target column name List<Target> targets = leftMostSubQueryNode.getTargets(); int[] targetMappings = new int[targets.size()]; for (int i = 0; i < targets.size(); i++) { if (targets.get(i).getEvalTree().getType() != EvalType.FIELD) { throw new TajoInternalError("Target of a UnionNode's subquery should be FieldEval."); } int index = leftMostSubQueryNode.getInSchema() .getColumnId(targets.get(i).getNamedColumn().getQualifiedName()); if (index < 0) { // If a target has alias, getNamedColumn() only returns alias Set<Column> columns = EvalTreeUtil.findUniqueColumns(targets.get(i).getEvalTree()); Column column = columns.iterator().next(); index = leftMostSubQueryNode.getInSchema().getColumnId(column.getQualifiedName()); } if (index < 0) { throw new TajoInternalError("Can't find matched Target in UnionNode's input schema: " + targets.get(i) + "->" + leftMostSubQueryNode.getInSchema()); } targetMappings[i] = index; } for (TableSubQueryNode eachNode : addedTableSubQueries) { if (eachNode.getPID() == leftMostSubQueryNode.getPID()) { continue; } List<Target> eachNodeTargets = eachNode.getTargets(); if (eachNodeTargets.size() != targetMappings.length) { throw new TajoInternalError( "Union query can't have different number of target columns."); } for (int i = 0; i < eachNodeTargets.size(); i++) { Column inColumn = eachNode.getInSchema().getColumn(targetMappings[i]); Target t = eachNodeTargets.get(i); t.setAlias(t.getNamedColumn().getQualifiedName()); EvalNode evalNode = eachNodeTargets.get(i).getEvalTree(); if (evalNode.getType() != EvalType.FIELD) { throw new TajoInternalError( "Target of a UnionNode's subquery should be FieldEval."); } FieldEval fieldEval = (FieldEval) evalNode; EvalTreeUtil.changeColumnRef(fieldEval, fieldEval.getColumnRef().getQualifiedName(), inColumn.getQualifiedName()); } } } else { LOG.warn("Can't find left most SubQuery in the UnionNode."); } } else { currentBlock.setPlan(node); } context.execBlockMap.put(node.getPID(), currentBlock); return node; } @Override public LogicalNode 
visitScan(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock queryBlock, ScanNode node, Stack<LogicalNode> stack) throws TajoException { ExecutionBlock newExecBlock = context.plan.newExecutionBlock(); newExecBlock.setPlan(node); context.execBlockMap.put(node.getPID(), newExecBlock); return node; } @Override public LogicalNode visitIndexScan(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock block, IndexScanNode node, Stack<LogicalNode> stack) throws TajoException { ExecutionBlock newBlock = context.plan.newExecutionBlock(); newBlock.setPlan(node); context.execBlockMap.put(node.getPID(), newBlock); return node; } @Override public LogicalNode visitPartitionedTableScan(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock block, PartitionedTableScanNode node, Stack<LogicalNode> stack) throws TajoException { ExecutionBlock newExecBlock = context.plan.newExecutionBlock(); newExecBlock.setPlan(node); context.execBlockMap.put(node.getPID(), newExecBlock); return node; } @Override public LogicalNode visitStoreTable(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock queryBlock, StoreTableNode node, Stack<LogicalNode> stack) throws TajoException { LogicalNode child = super.visitStoreTable(context, plan, queryBlock, node, stack); ExecutionBlock childBlock = context.execBlockMap.remove(child.getPID()); ExecutionBlock newExecBlock = buildStorePlan(context, childBlock, node); context.execBlockMap.put(node.getPID(), newExecBlock); return node; } @Override public LogicalNode visitCreateTable(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock queryBlock, CreateTableNode node, Stack<LogicalNode> stack) throws TajoException { LogicalNode child = super.visitStoreTable(context, plan, queryBlock, node, stack); ExecutionBlock childBlock = context.execBlockMap.remove(child.getPID()); ExecutionBlock newExecBlock = buildStorePlan(context, childBlock, node); context.execBlockMap.put(node.getPID(), newExecBlock); return node; } @Override public LogicalNode visitInsert(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock queryBlock, InsertNode node, Stack<LogicalNode> stack) throws TajoException { LogicalNode child = super.visitInsert(context, plan, queryBlock, node, stack); ExecutionBlock childBlock = context.execBlockMap.remove(child.getPID()); ExecutionBlock newExecBlock = buildStorePlan(context, childBlock, node); context.execBlockMap.put(node.getPID(), newExecBlock); return node; } @Override public LogicalNode visitCreateIndex(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock queryBlock, CreateIndexNode node, Stack<LogicalNode> stack) throws TajoException { LogicalNode child = super.visitCreateIndex(context, plan, queryBlock, node, stack); // Don't separate execution block. CreateIndex is pushed to the first execution block. ExecutionBlock childBlock = context.execBlockMap.remove(child.getPID()); node.setChild(childBlock.getPlan()); childBlock.setPlan(node); context.execBlockMap.put(node.getPID(), childBlock); return node; } } }
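To tie the pieces together, the fragment below condenses what buildTwoPhaseGroupby() does for a simple non-distinct aggregation such as SELECT grp1, count(*) FROM rel1 GROUP BY grp1. It is only a sketch: it repeats calls that already appear in the class above and assumes that masterPlan, childBlock, firstPhaseGroupby, secondPhaseGroupby, and dataFormat are in scope exactly as they are inside GlobalPlanner.

    // Phase 1 stays in the block that scans rel1 and produces partial counts per group.
    childBlock.setPlan(firstPhaseGroupby);

    // Phase 2 runs in a new block; equal groups meet again via a hash shuffle on the grouping columns.
    ExecutionBlock currentBlock = masterPlan.newExecutionBlock();
    DataChannel channel = new DataChannel(childBlock, currentBlock, HASH_SHUFFLE, 32);
    channel.setShuffleKeys(firstPhaseGroupby.getGroupingColumns());
    channel.setSchema(firstPhaseGroupby.getOutSchema());
    channel.setDataFormat(dataFormat);

    // The second phase reads the channel through a generated ScanNode and finishes the aggregation.
    ScanNode scanNode = buildInputExecutor(masterPlan.getLogicalPlan(), channel);
    secondPhaseGroupby.setChild(scanNode);
    secondPhaseGroupby.setInSchema(scanNode.getOutSchema());
    currentBlock.setPlan(secondPhaseGroupby);
    masterPlan.addConnect(channel);

    // Resulting DAG, after build() appends the terminal block:
    //   eb_1 (scan rel1 + first-phase group-by)  --HASH_SHUFFLE on grp1, 32 outputs-->  eb_2 (second-phase group-by)
    //   eb_2  --NONE_SHUFFLE, 1 output (setFinalOutputChannel)-->  terminal block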