Java tutorial: Hive's ConvertJoinMapJoin optimizer
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.optimizer;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.common.JavaUtils;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.AppMasterEventOperator;
import org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator;
import org.apache.hadoop.hive.ql.exec.DummyStoreOperator;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.MuxOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorFactory;
import org.apache.hadoop.hive.ql.exec.OperatorUtils;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.TezDummyStoreOperator;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.CommonMergeJoinDesc;
import org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.JoinCondDesc;
import org.apache.hadoop.hive.ql.plan.JoinDesc;
import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.ql.plan.OpTraits;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.Statistics;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * ConvertJoinMapJoin is an optimization that replaces a common join
 * (aka shuffle join) with a map join (aka broadcast or fragment replicate
 * join) when possible. Map joins have restrictions on which joins can be
 * converted (e.g.: full outer joins cannot be handled as map joins) as well
 * as memory restrictions (one side of the join has to fit into memory).
 */
public class ConvertJoinMapJoin implements NodeProcessor {

  static final private Log LOG = LogFactory.getLog(ConvertJoinMapJoin.class.getName());

  @Override
  /*
   * (non-Javadoc) we should ideally not modify the tree we traverse. However,
   * since we need to walk the tree at any time when we modify the operator, we
   * might as well do it here.
   */
  public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object...
      nodeOutputs) throws SemanticException {
    OptimizeTezProcContext context = (OptimizeTezProcContext) procCtx;
    JoinOperator joinOp = (JoinOperator) nd;
    TezBucketJoinProcCtx tezBucketJoinProcCtx = new TezBucketJoinProcCtx(context.conf);

    if (!context.conf.getBoolVar(HiveConf.ConfVars.HIVECONVERTJOIN)) {
      // we are just converting to a common merge join operator. The shuffle
      // join in the map-reduce case.
      Object retval = checkAndConvertSMBJoin(context, joinOp, tezBucketJoinProcCtx);
      if (retval == null) {
        return retval;
      } else {
        fallbackToReduceSideJoin(joinOp, context);
        return null;
      }
    }

    // if we have traits, and table info is present in the traits, we know the
    // exact number of buckets. Else choose the largest number of estimated
    // reducers from the parent operators.
    int numBuckets = -1;
    if (context.conf.getBoolVar(HiveConf.ConfVars.HIVE_CONVERT_JOIN_BUCKET_MAPJOIN_TEZ)) {
      numBuckets = estimateNumBuckets(joinOp, true);
    } else {
      numBuckets = 1;
    }
    LOG.info("Estimated number of buckets " + numBuckets);

    int mapJoinConversionPos = getMapJoinConversionPos(joinOp, context, numBuckets);
    if (mapJoinConversionPos < 0) {
      Object retval = checkAndConvertSMBJoin(context, joinOp, tezBucketJoinProcCtx);
      if (retval == null) {
        return retval;
      } else {
        // only case is full outer join with SMB enabled which is not possible. Convert to regular
        // join.
        fallbackToReduceSideJoin(joinOp, context);
        return null;
      }
    }

    if (numBuckets > 1) {
      if (context.conf.getBoolVar(HiveConf.ConfVars.HIVE_CONVERT_JOIN_BUCKET_MAPJOIN_TEZ)) {
        if (convertJoinBucketMapJoin(joinOp, context, mapJoinConversionPos, tezBucketJoinProcCtx)) {
          return null;
        }
      }
    }

    LOG.info("Convert to non-bucketed map join");
    // check if we can convert to a map join with no bucket scaling.
    mapJoinConversionPos = getMapJoinConversionPos(joinOp, context, 1);
    if (mapJoinConversionPos < 0) {
      // we are just converting to a common merge join operator. The shuffle
      // join in the map-reduce case.
      fallbackToReduceSideJoin(joinOp, context);
      return null;
    }

    MapJoinOperator mapJoinOp = convertJoinMapJoin(joinOp, context, mapJoinConversionPos, true);
    // map join operator by default has no bucket cols and num of reduce sinks
    // reduced by 1
    mapJoinOp.setOpTraits(new OpTraits(null, -1, null));
    mapJoinOp.setStatistics(joinOp.getStatistics());
    // propagate this change till the next RS
    for (Operator<? extends OperatorDesc> childOp : mapJoinOp.getChildOperators()) {
      setAllChildrenTraits(childOp, mapJoinOp.getOpTraits());
    }

    return null;
  }

  @SuppressWarnings("unchecked")
  private Object checkAndConvertSMBJoin(OptimizeTezProcContext context, JoinOperator joinOp,
      TezBucketJoinProcCtx tezBucketJoinProcCtx) throws SemanticException {
    // we cannot convert to bucket map join, we cannot convert to
    // map join either based on the size. Check if we can convert to SMB join.
    if (context.conf.getBoolVar(HiveConf.ConfVars.HIVE_AUTO_SORTMERGE_JOIN) == false) {
      fallbackToReduceSideJoin(joinOp, context);
      return null;
    }
    Class<?
        extends BigTableSelectorForAutoSMJ> bigTableMatcherClass = null;
    try {
      String selector = HiveConf.getVar(context.parseContext.getConf(),
          HiveConf.ConfVars.HIVE_AUTO_SORTMERGE_JOIN_BIGTABLE_SELECTOR);
      bigTableMatcherClass = JavaUtils.loadClass(selector);
    } catch (ClassNotFoundException e) {
      throw new SemanticException(e.getMessage());
    }

    BigTableSelectorForAutoSMJ bigTableMatcher =
        ReflectionUtils.newInstance(bigTableMatcherClass, null);
    JoinDesc joinDesc = joinOp.getConf();
    JoinCondDesc[] joinCondns = joinDesc.getConds();
    Set<Integer> joinCandidates = MapJoinProcessor.getBigTableCandidates(joinCondns);
    if (joinCandidates.isEmpty()) {
      // This is a full outer join. This can never be a map-join
      // of any type. So return false.
      return false;
    }
    int mapJoinConversionPos =
        bigTableMatcher.getBigTablePosition(context.parseContext, joinOp, joinCandidates);
    if (mapJoinConversionPos < 0) {
      // contains aliases from sub-query
      // we are just converting to a common merge join operator. The shuffle
      // join in the map-reduce case.
      fallbackToReduceSideJoin(joinOp, context);
      return null;
    }

    if (checkConvertJoinSMBJoin(joinOp, context, mapJoinConversionPos, tezBucketJoinProcCtx)) {
      convertJoinSMBJoin(joinOp, context, mapJoinConversionPos,
          tezBucketJoinProcCtx.getNumBuckets(), true);
    } else {
      // we are just converting to a common merge join operator. The shuffle
      // join in the map-reduce case.
      fallbackToReduceSideJoin(joinOp, context);
    }
    return null;
  }

  // replaces the join operator with a new CommonMergeJoinOperator, removes the
  // parent reduce sinks
  private void convertJoinSMBJoin(JoinOperator joinOp, OptimizeTezProcContext context,
      int mapJoinConversionPos, int numBuckets, boolean adjustParentsChildren)
      throws SemanticException {
    MapJoinDesc mapJoinDesc = null;
    if (adjustParentsChildren) {
      mapJoinDesc = MapJoinProcessor.getMapJoinDesc(context.conf, joinOp,
          joinOp.getConf().isLeftInputJoin(), joinOp.getConf().getBaseSrc(),
          joinOp.getConf().getMapAliases(), mapJoinConversionPos, true);
    } else {
      JoinDesc joinDesc = joinOp.getConf();
      // retain the original join desc in the map join.
      mapJoinDesc = new MapJoinDesc(
          MapJoinProcessor.getKeys(joinOp.getConf().isLeftInputJoin(),
              joinOp.getConf().getBaseSrc(), joinOp).getSecond(),
          null, joinDesc.getExprs(), null, null, joinDesc.getOutputColumnNames(),
          mapJoinConversionPos, joinDesc.getConds(), joinDesc.getFilters(),
          joinDesc.getNoOuterJoin(), null);
      mapJoinDesc.setNullSafes(joinDesc.getNullSafes());
      mapJoinDesc.setFilterMap(joinDesc.getFilterMap());
      mapJoinDesc.resetOrder();
    }

    CommonMergeJoinOperator mergeJoinOp = (CommonMergeJoinOperator) OperatorFactory
        .get(new CommonMergeJoinDesc(numBuckets, mapJoinConversionPos, mapJoinDesc),
            joinOp.getSchema());
    OpTraits opTraits = new OpTraits(joinOp.getOpTraits().getBucketColNames(), numBuckets,
        joinOp.getOpTraits().getSortCols());
    mergeJoinOp.setOpTraits(opTraits);
    mergeJoinOp.setStatistics(joinOp.getStatistics());

    for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
      int pos = parentOp.getChildOperators().indexOf(joinOp);
      parentOp.getChildOperators().remove(pos);
      parentOp.getChildOperators().add(pos, mergeJoinOp);
    }

    for (Operator<? extends OperatorDesc> childOp : joinOp.getChildOperators()) {
      int pos = childOp.getParentOperators().indexOf(joinOp);
      childOp.getParentOperators().remove(pos);
      childOp.getParentOperators().add(pos, mergeJoinOp);
    }

    List<Operator<? extends OperatorDesc>> childOperators = mergeJoinOp.getChildOperators();
    List<Operator<?
        extends OperatorDesc>> parentOperators = mergeJoinOp.getParentOperators();
    childOperators.clear();
    parentOperators.clear();
    childOperators.addAll(joinOp.getChildOperators());
    parentOperators.addAll(joinOp.getParentOperators());
    mergeJoinOp.getConf().setGenJoinKeys(false);

    if (adjustParentsChildren) {
      mergeJoinOp.getConf().setGenJoinKeys(true);
      List<Operator<? extends OperatorDesc>> newParentOpList =
          new ArrayList<Operator<? extends OperatorDesc>>();
      for (Operator<? extends OperatorDesc> parentOp : mergeJoinOp.getParentOperators()) {
        for (Operator<? extends OperatorDesc> grandParentOp : parentOp.getParentOperators()) {
          grandParentOp.getChildOperators().remove(parentOp);
          grandParentOp.getChildOperators().add(mergeJoinOp);
          newParentOpList.add(grandParentOp);
        }
      }
      mergeJoinOp.getParentOperators().clear();
      mergeJoinOp.getParentOperators().addAll(newParentOpList);
      List<Operator<? extends OperatorDesc>> parentOps =
          new ArrayList<Operator<? extends OperatorDesc>>(mergeJoinOp.getParentOperators());
      for (Operator<? extends OperatorDesc> parentOp : parentOps) {
        int parentIndex = mergeJoinOp.getParentOperators().indexOf(parentOp);
        if (parentIndex == mapJoinConversionPos) {
          continue;
        }

        // insert the dummy store operator here
        DummyStoreOperator dummyStoreOp = new TezDummyStoreOperator();
        dummyStoreOp.setParentOperators(new ArrayList<Operator<? extends OperatorDesc>>());
        dummyStoreOp.setChildOperators(new ArrayList<Operator<? extends OperatorDesc>>());
        dummyStoreOp.getChildOperators().add(mergeJoinOp);
        int index = parentOp.getChildOperators().indexOf(mergeJoinOp);
        parentOp.getChildOperators().remove(index);
        parentOp.getChildOperators().add(index, dummyStoreOp);
        dummyStoreOp.getParentOperators().add(parentOp);
        mergeJoinOp.getParentOperators().remove(parentIndex);
        mergeJoinOp.getParentOperators().add(parentIndex, dummyStoreOp);
      }
    }
    mergeJoinOp.cloneOriginalParentsList(mergeJoinOp.getParentOperators());
  }

  private void setAllChildrenTraits(Operator<? extends OperatorDesc> currentOp, OpTraits opTraits) {
    if (currentOp instanceof ReduceSinkOperator) {
      return;
    }
    currentOp.setOpTraits(new OpTraits(opTraits.getBucketColNames(),
        opTraits.getNumBuckets(), opTraits.getSortCols()));
    for (Operator<?
        extends OperatorDesc> childOp : currentOp.getChildOperators()) {
      if ((childOp instanceof ReduceSinkOperator) || (childOp instanceof GroupByOperator)) {
        break;
      }
      setAllChildrenTraits(childOp, opTraits);
    }
  }

  private boolean convertJoinBucketMapJoin(JoinOperator joinOp, OptimizeTezProcContext context,
      int bigTablePosition, TezBucketJoinProcCtx tezBucketJoinProcCtx) throws SemanticException {
    if (!checkConvertJoinBucketMapJoin(joinOp, context, bigTablePosition, tezBucketJoinProcCtx)) {
      LOG.info("Check conversion to bucket map join failed.");
      return false;
    }

    MapJoinOperator mapJoinOp = convertJoinMapJoin(joinOp, context, bigTablePosition, true);
    MapJoinDesc joinDesc = mapJoinOp.getConf();
    joinDesc.setBucketMapJoin(true);

    // we can set the traits for this join operator
    OpTraits opTraits = new OpTraits(joinOp.getOpTraits().getBucketColNames(),
        tezBucketJoinProcCtx.getNumBuckets(), null);
    mapJoinOp.setOpTraits(opTraits);
    mapJoinOp.setStatistics(joinOp.getStatistics());
    setNumberOfBucketsOnChildren(mapJoinOp);

    // Once the conversion is done, we can set the partitioner to bucket cols on the small table
    Map<String, Integer> bigTableBucketNumMapping = new HashMap<String, Integer>();
    bigTableBucketNumMapping.put(joinDesc.getBigTableAlias(), tezBucketJoinProcCtx.getNumBuckets());
    joinDesc.setBigTableBucketNumMapping(bigTableBucketNumMapping);

    return true;
  }

  /*
   * This method tries to convert a join to an SMB. This is done based on
   * traits. If the sorted-by columns are the same as the join columns, we
   * can convert the join to an SMB. Otherwise retain the bucket map join as it
   * is still more efficient than a regular join.
   */
  private boolean checkConvertJoinSMBJoin(JoinOperator joinOp, OptimizeTezProcContext context,
      int bigTablePosition, TezBucketJoinProcCtx tezBucketJoinProcCtx) throws SemanticException {

    ReduceSinkOperator bigTableRS =
        (ReduceSinkOperator) joinOp.getParentOperators().get(bigTablePosition);
    int numBuckets = bigTableRS.getParentOperators().get(0).getOpTraits().getNumBuckets();

    int size = -1;
    for (Operator<?> parentOp : joinOp.getParentOperators()) {
      // each side better have 0 or more RS. if either side is unbalanced, cannot convert.
      // This is a workaround for now. Right fix would be to refactor code in the
      // MapRecordProcessor and ReduceRecordProcessor with respect to the sources.
      Set<ReduceSinkOperator> set =
          OperatorUtils.findOperatorsUpstream(parentOp.getParentOperators(),
              ReduceSinkOperator.class);
      if (size < 0) {
        size = set.size();
        continue;
      }

      if (((size > 0) && (set.size() > 0)) || ((size == 0) && (set.size() == 0))) {
        continue;
      } else {
        return false;
      }
    }

    // the sort and bucket cols have to match on both sides for this
    // transformation of the join operation
    for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
      if (!(parentOp instanceof ReduceSinkOperator)) {
        // could be mux/demux operators. Currently not supported
        LOG.info("Found correlation optimizer operators. "
            + "Cannot convert to SMB at this time.");
        return false;
      }
      ReduceSinkOperator rsOp = (ReduceSinkOperator) parentOp;
      if (checkColEquality(rsOp.getParentOperators().get(0).getOpTraits().getSortCols(),
          rsOp.getOpTraits().getSortCols(), rsOp.getColumnExprMap(),
          tezBucketJoinProcCtx) == false) {
        LOG.info("We cannot convert to SMB because the sort column names do not match.");
        return false;
      }

      if (checkColEquality(rsOp.getParentOperators().get(0).getOpTraits().getBucketColNames(),
          rsOp.getOpTraits().getBucketColNames(), rsOp.getColumnExprMap(),
          tezBucketJoinProcCtx) == false) {
        LOG.info("We cannot convert to SMB because bucket column names do not match.");
        return false;
      }
    }

    if (numBuckets < 0) {
      numBuckets = bigTableRS.getConf().getNumReducers();
    }
    tezBucketJoinProcCtx.setNumBuckets(numBuckets);
    LOG.info("We can convert the join to an SMB join.");
    return true;
  }

  private void setNumberOfBucketsOnChildren(Operator<? extends OperatorDesc> currentOp) {
    int numBuckets = currentOp.getOpTraits().getNumBuckets();
    for (Operator<? extends OperatorDesc> op : currentOp.getChildOperators()) {
      if (!(op instanceof ReduceSinkOperator) && !(op instanceof GroupByOperator)) {
        op.getOpTraits().setNumBuckets(numBuckets);
        setNumberOfBucketsOnChildren(op);
      }
    }
  }

  /*
   * If the parent reduce sink of the big table side has the same emit key cols as its parent, we
   * can create a bucket map join eliminating the reduce sink.
   */
  private boolean checkConvertJoinBucketMapJoin(JoinOperator joinOp,
      OptimizeTezProcContext context, int bigTablePosition,
      TezBucketJoinProcCtx tezBucketJoinProcCtx) throws SemanticException {
    // bail on mux-operator because mux operator masks the emit keys of the
    // constituent reduce sinks
    if (!(joinOp.getParentOperators().get(0) instanceof ReduceSinkOperator)) {
      LOG.info("Operator is " + joinOp.getParentOperators().get(0).getName()
          + ". Cannot convert to bucket map join");
      return false;
    }

    ReduceSinkOperator rs = (ReduceSinkOperator) joinOp.getParentOperators().get(bigTablePosition);
    List<List<String>> parentColNames = rs.getOpTraits().getBucketColNames();
    Operator<? extends OperatorDesc> parentOfParent = rs.getParentOperators().get(0);
    List<List<String>> grandParentColNames = parentOfParent.getOpTraits().getBucketColNames();
    int numBuckets = parentOfParent.getOpTraits().getNumBuckets();
    // all keys matched.
    if (checkColEquality(grandParentColNames, parentColNames, rs.getColumnExprMap(),
        tezBucketJoinProcCtx) == false) {
      LOG.info("No info available to check for bucket map join. Cannot convert");
      return false;
    }

    /*
     * this is the case when the big table is a sub-query and is probably already bucketed by the
     * join column in say a group by operation
     */
    if (numBuckets < 0) {
      numBuckets = rs.getConf().getNumReducers();
    }
    tezBucketJoinProcCtx.setNumBuckets(numBuckets);
    return true;
  }

  private boolean checkColEquality(List<List<String>> grandParentColNames,
      List<List<String>> parentColNames, Map<String, ExprNodeDesc> colExprMap,
      TezBucketJoinProcCtx tezBucketJoinProcCtx) {

    if ((grandParentColNames == null) || (parentColNames == null)) {
      return false;
    }

    if ((parentColNames != null) && (parentColNames.isEmpty() == false)) {
      for (List<String> listBucketCols : grandParentColNames) {
        // can happen if this operator does not carry forward the previous bucketing columns
        // for e.g.
        // another join operator which does not carry one of the sides' key columns
        if (listBucketCols.isEmpty()) {
          continue;
        }
        int colCount = 0;
        // parent op is guaranteed to have a single list because it is a reduce sink
        for (String colName : parentColNames.get(0)) {
          if (listBucketCols.size() <= colCount) {
            // can happen with virtual columns. RS would add the column to its output columns
            // but it would not exist in the grandparent output columns or exprMap.
            return false;
          }
          // all columns need to be at least a subset of the parentOfParent's bucket cols
          ExprNodeDesc exprNodeDesc = colExprMap.get(colName);
          if (exprNodeDesc instanceof ExprNodeColumnDesc) {
            if (((ExprNodeColumnDesc) exprNodeDesc).getColumn()
                .equals(listBucketCols.get(colCount))) {
              colCount++;
            } else {
              break;
            }
          }

          if (colCount == parentColNames.get(0).size()) {
            return true;
          }
        }
      }
      return false;
    }

    return false;
  }

  public int getMapJoinConversionPos(JoinOperator joinOp, OptimizeTezProcContext context,
      int buckets) throws SemanticException {
    /*
     * HIVE-9038: Join tests fail in tez when we have more than 1 join on the same key and there is
     * an outer join down the join tree that requires filterTag. We disable this conversion to map
     * join here now. We need to emulate the behavior of HashTableSinkOperator as in MR or create a
     * new operation to be able to support this. This seems like enough of a corner case to
     * special-case it for now.
     */
    if (joinOp.getConf().getConds().length > 1) {
      boolean hasOuter = false;
      for (JoinCondDesc joinCondDesc : joinOp.getConf().getConds()) {
        switch (joinCondDesc.getType()) {
        case JoinDesc.INNER_JOIN:
        case JoinDesc.LEFT_SEMI_JOIN:
        case JoinDesc.UNIQUE_JOIN:
          hasOuter = false;
          break;

        case JoinDesc.FULL_OUTER_JOIN:
        case JoinDesc.LEFT_OUTER_JOIN:
        case JoinDesc.RIGHT_OUTER_JOIN:
          hasOuter = true;
          break;

        default:
          throw new SemanticException("Unknown join type " + joinCondDesc.getType());
        }
      }
      if (hasOuter) {
        return -1;
      }
    }

    Set<Integer> bigTableCandidateSet =
        MapJoinProcessor.getBigTableCandidates(joinOp.getConf().getConds());

    long maxSize = context.conf.getLongVar(
        HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);

    int bigTablePosition = -1;

    Statistics bigInputStat = null;
    long totalSize = 0;
    int pos = 0;

    // bigTableFound means we've encountered a table that's bigger than the
    // max. This table is either the big table or we cannot convert.
    boolean bigTableFound = false;

    for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
      Statistics currInputStat = parentOp.getStatistics();
      if (currInputStat == null) {
        LOG.warn("Couldn't get statistics from: " + parentOp);
        return -1;
      }

      long inputSize = currInputStat.getDataSize();
      if ((bigInputStat == null)
          || ((bigInputStat != null) && (inputSize > bigInputStat.getDataSize()))) {

        if (bigTableFound) {
          // cannot convert to map join; we've already chosen a big table
          // on size and there's another one that's bigger.
          return -1;
        }

        if (inputSize / buckets > maxSize) {
          if (!bigTableCandidateSet.contains(pos)) {
            // can't use the current table as the big table, but it's too
            // big for the map side.
            return -1;
          }

          bigTableFound = true;
        }

        if (bigInputStat != null) {
          // we're replacing the current big table with a new one. Need
          // to count the current one as a map table then.
          totalSize += bigInputStat.getDataSize();
        }

        if (totalSize / buckets > maxSize) {
          // sum of the small tables' sizes in this join exceeds the configured limit;
          // hence cannot convert.
          return -1;
        }

        if (bigTableCandidateSet.contains(pos)) {
          bigTablePosition = pos;
          bigInputStat = currInputStat;
        }
      } else {
        totalSize += currInputStat.getDataSize();
        if (totalSize / buckets > maxSize) {
          // cannot hold all map tables in memory. Cannot convert.
          return -1;
        }
      }
      pos++;
    }

    return bigTablePosition;
  }

  /*
   * Once we have decided on the map join, the tree would transform from
   *
   *        |                          |
   *       Join                     MapJoin
   *       /  \                      /    \
   *     RS    RS       --->       RS      TS (big table)
   *    /        \                /
   *  TS          TS            TS (small table)
   *
   * for tez.
   */
  public MapJoinOperator convertJoinMapJoin(JoinOperator joinOp, OptimizeTezProcContext context,
      int bigTablePosition, boolean removeReduceSink) throws SemanticException {
    // bail on mux operator because currently the mux operator masks the emit keys
    // of the constituent reduce sinks.
    for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
      if (parentOp instanceof MuxOperator) {
        return null;
      }
    }

    // can safely convert the join to a map join.
    MapJoinOperator mapJoinOp =
        MapJoinProcessor.convertJoinOpMapJoinOp(context.conf, joinOp,
            joinOp.getConf().isLeftInputJoin(), joinOp.getConf().getBaseSrc(),
            joinOp.getConf().getMapAliases(), bigTablePosition, true, removeReduceSink);
    mapJoinOp.getConf().setHybridHashJoin(HiveConf.getBoolVar(context.conf,
        HiveConf.ConfVars.HIVEUSEHYBRIDGRACEHASHJOIN));

    Operator<? extends OperatorDesc> parentBigTableOp =
        mapJoinOp.getParentOperators().get(bigTablePosition);
    if (parentBigTableOp instanceof ReduceSinkOperator) {
      if (removeReduceSink) {
        for (Operator<?> p : parentBigTableOp.getParentOperators()) {
          // we might have generated a dynamic partition operator chain. Since
          // we're removing the reduce sink we need to remove that too.
          Set<Operator<?>> dynamicPartitionOperators = new HashSet<Operator<?>>();
          Map<Operator<?>, AppMasterEventOperator> opEventPairs = new HashMap<>();
          for (Operator<?> c : p.getChildOperators()) {
            AppMasterEventOperator event = findDynamicPartitionBroadcast(c);
            if (event != null) {
              dynamicPartitionOperators.add(c);
              opEventPairs.put(c, event);
            }
          }

          for (Operator<?> c : dynamicPartitionOperators) {
            if (context.pruningOpsRemovedByPriorOpt.isEmpty()
                || !context.pruningOpsRemovedByPriorOpt.contains(opEventPairs.get(c))) {
              p.removeChild(c);
              // at this point we've found the fork in the op pipeline that has the pruning
              // as a child plan.
              LOG.info("Disabling dynamic pruning for: "
                  + ((DynamicPruningEventDesc) opEventPairs.get(c).getConf()).getTableScan()
                      .getName() + ". Need to be removed together with reduce sink");
            }
          }

          for (Operator<?> op : dynamicPartitionOperators) {
            context.pruningOpsRemovedByPriorOpt.add(opEventPairs.get(op));
          }
        }

        mapJoinOp.getParentOperators().remove(bigTablePosition);
        if (!(mapJoinOp.getParentOperators().contains(
            parentBigTableOp.getParentOperators().get(0)))) {
          mapJoinOp.getParentOperators().add(bigTablePosition,
              parentBigTableOp.getParentOperators().get(0));
        }
        parentBigTableOp.getParentOperators().get(0).removeChild(parentBigTableOp);
      }

      for (Operator<?
          extends OperatorDesc> op : mapJoinOp.getParentOperators()) {
        if (!(op.getChildOperators().contains(mapJoinOp))) {
          op.getChildOperators().add(mapJoinOp);
        }
        op.getChildOperators().remove(joinOp);
      }
    }

    return mapJoinOp;
  }

  private AppMasterEventOperator findDynamicPartitionBroadcast(Operator<?> parent) {
    for (Operator<?> op : parent.getChildOperators()) {
      while (op != null) {
        if (op instanceof AppMasterEventOperator && op.getConf() instanceof DynamicPruningEventDesc) {
          // found dynamic partition pruning operator
          return (AppMasterEventOperator) op;
        }
        if (op instanceof ReduceSinkOperator || op instanceof FileSinkOperator) {
          // crossing reduce sink or file sink means the pruning isn't for this parent.
          break;
        }
        if (op.getChildOperators().size() != 1) {
          // dynamic partition pruning pipeline doesn't have multiple children
          break;
        }
        op = op.getChildOperators().get(0);
      }
    }
    return null;
  }

  /**
   * Estimate the number of buckets in the join, using the parent operators' OpTraits and/or
   * the parent operators' number of reducers.
   * @param joinOp the join operator being considered
   * @param useOpTraits Whether OpTraits should be used for the estimate.
   * @return the estimated number of buckets
   */
  private static int estimateNumBuckets(JoinOperator joinOp, boolean useOpTraits) {
    int numBuckets = -1;
    int estimatedBuckets = -1;
    for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
      if (parentOp.getOpTraits().getNumBuckets() > 0) {
        numBuckets = (numBuckets < parentOp.getOpTraits().getNumBuckets())
            ? parentOp.getOpTraits().getNumBuckets() : numBuckets;
      }

      if (parentOp instanceof ReduceSinkOperator) {
        ReduceSinkOperator rs = (ReduceSinkOperator) parentOp;
        estimatedBuckets = (estimatedBuckets < rs.getConf().getNumReducers())
            ? rs.getConf().getNumReducers() : estimatedBuckets;
      }
    }

    if (!useOpTraits) {
      // Ignore the value we got from OpTraits.
      // The logic below will fall back to the estimate from numReducers
      numBuckets = -1;
    }

    if (numBuckets <= 0) {
      numBuckets = estimatedBuckets;
      if (numBuckets <= 0) {
        numBuckets = 1;
      }
    }

    return numBuckets;
  }

  private boolean convertJoinDynamicPartitionedHashJoin(JoinOperator joinOp,
      OptimizeTezProcContext context) throws SemanticException {
    // Attempt dynamic partitioned hash join
    // Since we don't have big table index yet, must start with estimate of numReducers
    int numReducers = estimateNumBuckets(joinOp, false);
    LOG.info("Try dynamic partitioned hash join with estimated " + numReducers + " reducers");
    int bigTablePos = getMapJoinConversionPos(joinOp, context, numReducers);
    if (bigTablePos >= 0) {
      // Now that we have the big table index, get real numReducers value based on big table RS
      ReduceSinkOperator bigTableParentRS =
          (ReduceSinkOperator) (joinOp.getParentOperators().get(bigTablePos));
      numReducers = bigTableParentRS.getConf().getNumReducers();
      LOG.debug("Real big table reducers = " + numReducers);

      MapJoinOperator mapJoinOp = convertJoinMapJoin(joinOp, context, bigTablePos, false);
      if (mapJoinOp != null) {
        LOG.info("Selected dynamic partitioned hash join");
        mapJoinOp.getConf().setDynamicPartitionHashJoin(true);
        // Set OpTraits for dynamically partitioned hash join:
        // bucketColNames: Re-use previous joinOp's bucketColNames. Parent operators should be
        // reduce sinks, which should have bucket columns based on the join keys.
        // numBuckets: set to number of reducers
        // sortCols: This is an unsorted join - no sort cols
        OpTraits opTraits = new OpTraits(joinOp.getOpTraits().getBucketColNames(),
            numReducers, null);
        mapJoinOp.setOpTraits(opTraits);
        mapJoinOp.setStatistics(joinOp.getStatistics());
        // propagate this change till the next RS
        for (Operator<? extends OperatorDesc> childOp : mapJoinOp.getChildOperators()) {
          setAllChildrenTraits(childOp, mapJoinOp.getOpTraits());
        }
        return true;
      }
    }

    return false;
  }

  private void fallbackToReduceSideJoin(JoinOperator joinOp, OptimizeTezProcContext context)
      throws SemanticException {
    if (context.conf.getBoolVar(HiveConf.ConfVars.HIVECONVERTJOIN)
        && context.conf.getBoolVar(HiveConf.ConfVars.HIVEDYNAMICPARTITIONHASHJOIN)) {
      if (convertJoinDynamicPartitionedHashJoin(joinOp, context)) {
        return;
      }
    }

    // we are just converting to a common merge join operator. The shuffle
    // join in the map-reduce case.
    int pos = 0; // it doesn't matter which position we use in this case.
    LOG.info("Fallback to common merge join operator");
    convertJoinSMBJoin(joinOp, context, pos, 0, false);
  }
}
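Every conversion path above is gated by HiveConf switches that the optimizer reads from the OptimizeTezProcContext: HIVECONVERTJOIN, HIVE_CONVERT_JOIN_BUCKET_MAPJOIN_TEZ, HIVE_AUTO_SORTMERGE_JOIN, HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD, HIVEUSEHYBRIDGRACEHASHJOIN and HIVEDYNAMICPARTITIONHASHJOIN. The snippet below is a minimal standalone sketch, not part of the Hive source: it only sets and reads those same ConfVars on a HiveConf instance so you can see which knob feeds which decision; the class name and the threshold value are made up for illustration.

import org.apache.hadoop.hive.conf.HiveConf;

// Hypothetical helper for experimenting with the switches ConvertJoinMapJoin consults.
public class MapJoinConfSketch {
  public static void main(String[] args) {
    HiveConf conf = new HiveConf();

    // Master switch checked at the top of process(); when false the join
    // falls back to the common merge join path.
    conf.setBoolVar(HiveConf.ConfVars.HIVECONVERTJOIN, true);

    // Enables the bucket map join variant on Tez (convertJoinBucketMapJoin).
    conf.setBoolVar(HiveConf.ConfVars.HIVE_CONVERT_JOIN_BUCKET_MAPJOIN_TEZ, true);

    // Enables the SMB join path (checkAndConvertSMBJoin).
    conf.setBoolVar(HiveConf.ConfVars.HIVE_AUTO_SORTMERGE_JOIN, true);

    // Size budget for the small-table side used by getMapJoinConversionPos();
    // 64 MB here is an arbitrary example value.
    conf.setLongVar(HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD, 64L * 1024 * 1024);

    System.out.println("map join conversion enabled: "
        + conf.getBoolVar(HiveConf.ConfVars.HIVECONVERTJOIN));
    System.out.println("small table size budget (bytes): "
        + conf.getLongVar(HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD));
  }
}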