org.apache.drill.exec.physical.impl.join.HashJoinBatch.java Source code

Introduction

Here is the source code for org.apache.drill.exec.physical.impl.join.HashJoinBatch.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.exec.physical.impl.join;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

import org.apache.commons.io.FileUtils;
import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.common.expression.FieldReference;
import org.apache.drill.common.expression.PathSegment;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.logical.data.JoinCondition;
import org.apache.drill.common.logical.data.NamedExpression;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.common.types.TypeProtos.DataMode;
import org.apache.drill.common.types.TypeProtos.MajorType;
import org.apache.drill.common.types.Types;
import org.apache.drill.exec.ExecConstants;
import org.apache.drill.exec.exception.ClassTransformationException;
import org.apache.drill.exec.exception.OutOfMemoryException;
import org.apache.drill.exec.exception.SchemaChangeException;
import org.apache.drill.exec.expr.CodeGenerator;
import org.apache.drill.exec.memory.BaseAllocator;
import org.apache.drill.exec.memory.BufferAllocator;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.ops.MetricDef;
import org.apache.drill.exec.physical.base.AbstractBase;
import org.apache.drill.exec.physical.config.HashJoinPOP;
import org.apache.drill.exec.physical.impl.aggregate.SpilledRecordbatch;
import org.apache.drill.exec.physical.impl.common.ChainedHashTable;
import org.apache.drill.exec.physical.impl.common.HashTable;
import org.apache.drill.exec.physical.impl.common.HashTableConfig;
import org.apache.drill.exec.physical.impl.common.HashTableStats;
import org.apache.drill.exec.physical.impl.common.Comparator;
import org.apache.drill.exec.physical.impl.common.HashPartition;
import org.apache.drill.exec.physical.impl.spill.SpillSet;
import org.apache.drill.exec.record.AbstractBinaryRecordBatch;
import org.apache.drill.exec.record.BatchSchema;
import org.apache.drill.exec.record.JoinBatchMemoryManager;
import org.apache.drill.exec.record.MaterializedField;
import org.apache.drill.exec.record.RecordBatch;
import org.apache.drill.exec.record.RecordBatchSizer;
import org.apache.drill.exec.record.VectorWrapper;
import org.apache.drill.exec.vector.IntVector;
import org.apache.drill.exec.vector.ValueVector;
import org.apache.drill.exec.vector.complex.AbstractContainerVector;
import org.apache.calcite.rel.core.JoinRelType;

import static org.apache.drill.exec.record.JoinBatchMemoryManager.LEFT_INDEX;
import static org.apache.drill.exec.record.JoinBatchMemoryManager.RIGHT_INDEX;

/**
 *   This class implements the runtime execution for the Hash-Join operator
 *   supporting INNER, LEFT OUTER, RIGHT OUTER, and FULL OUTER joins
 *
 *   This implementation splits the incoming Build side rows into multiple Partitions, thus allowing spilling of
 *   some of these partitions to disk if memory gets tight. Each partition is implemented as a {@link HashPartition}.
 *   After the build phase is over, in the most general case, some of the partitions were spilled, and the others
 *   are in memory. Each of the partitions in memory would get a {@link HashTable} built.
 *      Next the Probe side is read, and each row is key matched with a Build partition. If that partition is in
 *   memory, then the key is used to probe and perform the join, and the results are added to the outgoing batch.
 *   But if that build side partition was spilled, then the matching Probe size partition is spilled as well.
 *      After all the Probe side was processed, we are left with pairs of spilled partitions. Then each pair is
 *   processed individually (that Build partition should be smaller than the original, hence likely fit whole into
 *   memory to allow probing; if not -- see below).
 *      Processing of each spilled pair is EXACTLY like processing the original Build/Probe incomings. (As a fact,
 *   the {@Link #innerNext() innerNext} method calls itself recursively !!). Thus the spilled build partition is
 *   read and divided into new partitions, which in turn may spill again (and again...).
 *   The code tracks these spilling "cycles". Normally any such "again" (i.e. cycle of 2 or greater) is a waste,
 *   indicating that the number of partitions chosen was too small.
 */
public class HashJoinBatch extends AbstractBinaryRecordBatch<HashJoinPOP> {
    protected static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(HashJoinBatch.class);

    /**
     * The maximum number of records within each internal batch.
     */
    private int RECORDS_PER_BATCH; // internal batches

    // Join type, INNER, LEFT, RIGHT or OUTER
    private final JoinRelType joinType;

    // Join conditions
    private final List<JoinCondition> conditions;

    // Runtime generated class implementing HashJoinProbe interface
    private HashJoinProbe hashJoinProbe = null;

    private final List<NamedExpression> rightExpr;

    /**
     * Names of the join columns. This names are used in order to help estimate the size of the {@link HashTable}s.
     */
    private final Set<String> buildJoinColumns;

    // Fields used for partitioning

    /**
     * The number of {@link HashPartition}s. This is configured via a system option and set in {@link #partitionNumTuning(int, HashJoinMemoryCalculator.BuildSidePartitioning)}.
     */
    private int numPartitions = 1; // must be 2 to the power of bitsInMask
    private int partitionMask = 0; // numPartitions - 1
    private int bitsInMask = 0; // number of bits in the MASK

    /**
     * The master class used to generate {@link HashTable}s.
     */
    private ChainedHashTable baseHashTable;
    private boolean buildSideIsEmpty = true;
    private boolean canSpill = true;
    private boolean wasKilled; // a kill was received, may need to clean spilled partns

    /**
     * This array holds the currently active {@link HashPartition}s.
     */
    HashPartition partitions[];

    // Number of records in the output container
    private int outputRecords;

    // Schema of the build side
    private BatchSchema rightSchema;
    // Schema of the probe side
    private BatchSchema probeSchema;

    private int rightHVColPosition;
    private BufferAllocator allocator;
    // Local fields for left/right incoming - may be replaced when reading from spilled
    private RecordBatch buildBatch;
    private RecordBatch probeBatch;

    // For handling spilling
    private SpillSet spillSet;
    HashJoinPOP popConfig;

    private int cycleNum = 0; // 1-primary, 2-secondary, 3-tertiary, etc.
    private int originalPartition = -1; // the partition a secondary reads from
    IntVector read_right_HV_vector; // HV vector that was read from the spilled batch
    private int maxBatchesInMemory;

    /**
     * This holds information about the spilled partitions for the build and probe side.
     */
    public static class HJSpilledPartition {
        public int innerSpilledBatches;
        public String innerSpillFile;
        public int outerSpilledBatches;
        public String outerSpillFile;
        int cycleNum;
        int origPartn;
        int prevOrigPartn;
    }

    /**
     * Queue of spilled partitions to process.
     */
    private ArrayList<HJSpilledPartition> spilledPartitionsList;
    private HJSpilledPartition spilledInners[]; // for the outer to find the partition

    public enum Metric implements MetricDef {
        NUM_BUCKETS, NUM_ENTRIES, NUM_RESIZING, RESIZING_TIME_MS, NUM_PARTITIONS, SPILLED_PARTITIONS, // number of original partitions spilled to disk
        SPILL_MB, // Number of MB of data spilled to disk. This amount is first written,
        // then later re-read. So, disk I/O is twice this amount.
        SPILL_CYCLE, // 0 - no spill, 1 - spill, 2 - SECONDARY, 3 - TERTIARY
        LEFT_INPUT_BATCH_COUNT, LEFT_AVG_INPUT_BATCH_BYTES, LEFT_AVG_INPUT_ROW_BYTES, LEFT_INPUT_RECORD_COUNT, RIGHT_INPUT_BATCH_COUNT, RIGHT_AVG_INPUT_BATCH_BYTES, RIGHT_AVG_INPUT_ROW_BYTES, RIGHT_INPUT_RECORD_COUNT, OUTPUT_BATCH_COUNT, AVG_OUTPUT_BATCH_BYTES, AVG_OUTPUT_ROW_BYTES, OUTPUT_RECORD_COUNT;

        // duplicate for hash ag

        @Override
        public int metricId() {
            return ordinal();
        }
    }

    @Override
    public int getRecordCount() {
        return outputRecords;
    }

    @Override
    protected void buildSchema() throws SchemaChangeException {
        if (!prefetchFirstBatchFromBothSides()) {
            return;
        }

        // Initialize the hash join helper context
        if (rightUpstream != IterOutcome.NONE) {
            setupHashTable();
        }
        setupOutputContainerSchema();
        try {
            hashJoinProbe = setupHashJoinProbe();
        } catch (IOException | ClassTransformationException e) {
            throw new SchemaChangeException(e);
        }

        container.buildSchema(BatchSchema.SelectionVectorMode.NONE);
    }

    @Override
    protected boolean prefetchFirstBatchFromBothSides() {
        leftUpstream = sniffNonEmptyBatch(0, left);
        rightUpstream = sniffNonEmptyBatch(1, right);

        // For build side, use aggregate i.e. average row width across batches
        batchMemoryManager.update(LEFT_INDEX, 0);
        batchMemoryManager.update(RIGHT_INDEX, 0, true);

        if (logger.isDebugEnabled()) {
            logger.debug("BATCH_STATS, incoming left:\n {}", batchMemoryManager.getRecordBatchSizer(LEFT_INDEX));
            logger.debug("BATCH_STATS, incoming right:\n {}", batchMemoryManager.getRecordBatchSizer(RIGHT_INDEX));
        }

        if (leftUpstream == IterOutcome.STOP || rightUpstream == IterOutcome.STOP) {
            state = BatchState.STOP;
            return false;
        }

        if (leftUpstream == IterOutcome.OUT_OF_MEMORY || rightUpstream == IterOutcome.OUT_OF_MEMORY) {
            state = BatchState.OUT_OF_MEMORY;
            return false;
        }

        if (checkForEarlyFinish(leftUpstream, rightUpstream)) {
            state = BatchState.DONE;
            return false;
        }

        state = BatchState.FIRST; // Got our first batches on both sides
        return true;
    }

    /**
     * Currently in order to accurately predict memory usage for spilling, the first non-empty build side and probe side batches are needed. This method
     * fetches the first non-empty batch from the left or right side.
     * @param inputIndex Index specifying whether to work with the left or right input.
     * @param recordBatch The left or right record batch.
     * @return The {@link org.apache.drill.exec.record.RecordBatch.IterOutcome} for the left or right record batch.
     */
    private IterOutcome sniffNonEmptyBatch(int inputIndex, RecordBatch recordBatch) {
        while (true) {
            IterOutcome outcome = next(inputIndex, recordBatch);

            switch (outcome) {
            case OK_NEW_SCHEMA:
                if (inputIndex == 0) {
                    // Indicate that a schema was seen (in case probe side is empty)
                    probeSchema = probeBatch.getSchema();
                } else {
                    // We need to have the schema of the build side even when the build side is empty
                    rightSchema = buildBatch.getSchema();
                    // position of the new "column" for keeping the hash values (after the real columns)
                    rightHVColPosition = buildBatch.getContainer().getNumberOfColumns();
                    // new schema can also contain records
                }
            case OK:
                if (recordBatch.getRecordCount() == 0) {
                    continue;
                }
                // We got a non empty batch
            default:
                // Other cases termination conditions
                return outcome;
            }
        }
    }

    /**
     * Determines the memory calculator to use. If maxNumBatches is configured simple batch counting is used to spill. Otherwise
     * memory calculations are used to determine when to spill.
     * @return The memory calculator to use.
     */
    public HashJoinMemoryCalculator getCalculatorImpl() {
        if (maxBatchesInMemory == 0) {
            final double safetyFactor = context.getOptions().getDouble(ExecConstants.HASHJOIN_SAFETY_FACTOR_KEY);
            final double fragmentationFactor = context.getOptions()
                    .getDouble(ExecConstants.HASHJOIN_FRAGMENTATION_FACTOR_KEY);
            final double hashTableDoublingFactor = context.getOptions()
                    .getDouble(ExecConstants.HASHJOIN_HASH_DOUBLE_FACTOR_KEY);
            final String hashTableCalculatorType = context.getOptions()
                    .getString(ExecConstants.HASHJOIN_HASHTABLE_CALC_TYPE_KEY);

            return new HashJoinMemoryCalculatorImpl(safetyFactor, fragmentationFactor, hashTableDoublingFactor,
                    hashTableCalculatorType);
        } else {
            return new HashJoinMechanicalMemoryCalculator(maxBatchesInMemory);
        }
    }

    @Override
    public IterOutcome innerNext() {
        // In case incoming was killed before, just cleanup and return
        if (wasKilled) {
            this.cleanup();
            super.close();
            return IterOutcome.NONE;
        }

        try {
            /* If we are here for the first time, execute the build phase of the
             * hash join and setup the run time generated class for the probe side
             */
            if (state == BatchState.FIRST) {
                // Build the hash table, using the build side record batches.
                executeBuildPhase();
                // Update the hash table related stats for the operator
                updateStats();
                // Initialize various settings for the probe side
                hashJoinProbe.setupHashJoinProbe(probeBatch, this, joinType, leftUpstream, partitions, cycleNum,
                        container, spilledInners, buildSideIsEmpty, numPartitions, rightHVColPosition);
            }

            // Try to probe and project, or recursively handle a spilled partition
            if (!buildSideIsEmpty || // If there are build-side rows
                    joinType != JoinRelType.INNER) { // or if this is a left/full outer join

                // Allocate the memory for the vectors in the output container
                batchMemoryManager.allocateVectors(container);
                hashJoinProbe.setTargetOutputCount(batchMemoryManager.getOutputRowCount());

                outputRecords = hashJoinProbe.probeAndProject();

                for (final VectorWrapper<?> v : container) {
                    v.getValueVector().getMutator().setValueCount(outputRecords);
                }
                container.setRecordCount(outputRecords);

                batchMemoryManager.updateOutgoingStats(outputRecords);
                if (logger.isDebugEnabled()) {
                    logger.debug("BATCH_STATS, outgoing:\n {}", new RecordBatchSizer(this));
                }

                /* We are here because of one the following
                 * 1. Completed processing of all the records and we are done
                 * 2. We've filled up the outgoing batch to the maximum and we need to return upstream
                 * Either case build the output container's schema and return
                 */
                if (outputRecords > 0 || state == BatchState.FIRST) {
                    if (state == BatchState.FIRST) {
                        state = BatchState.NOT_FIRST;
                    }

                    return IterOutcome.OK;
                }

                // Free all partitions' in-memory data structures
                // (In case need to start processing spilled partitions)
                for (HashPartition partn : partitions) {
                    partn.cleanup(false); // clean, but do not delete the spill files !!
                }

                //
                //  (recursively) Handle the spilled partitions, if any
                //
                if (!buildSideIsEmpty && !spilledPartitionsList.isEmpty()) {
                    // Get the next (previously) spilled partition to handle as incoming
                    HJSpilledPartition currSp = spilledPartitionsList.remove(0);

                    // Create a BUILD-side "incoming" out of the inner spill file of that partition
                    buildBatch = new SpilledRecordbatch(currSp.innerSpillFile, currSp.innerSpilledBatches, context,
                            rightSchema, oContext, spillSet);
                    // The above ctor call also got the first batch; need to update the outcome
                    rightUpstream = ((SpilledRecordbatch) buildBatch).getInitialOutcome();

                    if (currSp.outerSpilledBatches > 0) {
                        // Create a PROBE-side "incoming" out of the outer spill file of that partition
                        probeBatch = new SpilledRecordbatch(currSp.outerSpillFile, currSp.outerSpilledBatches,
                                context, probeSchema, oContext, spillSet);
                        // The above ctor call also got the first batch; need to update the outcome
                        leftUpstream = ((SpilledRecordbatch) probeBatch).getInitialOutcome();
                    } else {
                        probeBatch = left; // if no outer batch then reuse left - needed for updateIncoming()
                        leftUpstream = IterOutcome.NONE;
                        hashJoinProbe.changeToFinalProbeState();
                    }

                    // update the cycle num if needed
                    // The current cycle num should always be one larger than in the spilled partition
                    if (cycleNum == currSp.cycleNum) {
                        cycleNum = 1 + currSp.cycleNum;
                        stats.setLongStat(Metric.SPILL_CYCLE, cycleNum); // update stats
                        // report first spill or memory stressful situations
                        if (cycleNum == 1) {
                            logger.info("Started reading spilled records ");
                        }
                        if (cycleNum == 2) {
                            logger.info("SECONDARY SPILLING ");
                        }
                        if (cycleNum == 3) {
                            logger.warn("TERTIARY SPILLING ");
                        }
                        if (cycleNum == 4) {
                            logger.warn("QUATERNARY SPILLING ");
                        }
                        if (cycleNum == 5) {
                            logger.warn("QUINARY SPILLING ");
                        }
                        if (cycleNum * bitsInMask > 20) {
                            spilledPartitionsList.add(currSp); // so cleanup() would delete the curr spill files
                            this.cleanup();
                            throw UserException.unsupportedError().message(
                                    "Hash-Join can not partition the inner data any further (probably due to too many join-key duplicates)\n"
                                            + "On cycle num %d mem available %d num partitions %d",
                                    cycleNum, allocator.getLimit(), numPartitions).build(logger);
                        }
                    }
                    logger.debug(
                            "Start reading spilled partition {} (prev {}) from cycle {} (with {}-{} batches)."
                                    + " More {} spilled partitions left.",
                            currSp.origPartn, currSp.prevOrigPartn, currSp.cycleNum, currSp.outerSpilledBatches,
                            currSp.innerSpilledBatches, spilledPartitionsList.size());

                    state = BatchState.FIRST; // TODO need to determine if this is still necessary since prefetchFirstBatchFromBothSides sets this

                    return innerNext(); // start processing the next spilled partition "recursively"
                }

            } else {
                // Our build side is empty, we won't have any matches, clear the probe side
                if (leftUpstream == IterOutcome.OK_NEW_SCHEMA || leftUpstream == IterOutcome.OK) {
                    for (final VectorWrapper<?> wrapper : probeBatch) {
                        wrapper.getValueVector().clear();
                    }
                    probeBatch.kill(true);
                    leftUpstream = next(HashJoinHelper.LEFT_INPUT, probeBatch);
                    while (leftUpstream == IterOutcome.OK_NEW_SCHEMA || leftUpstream == IterOutcome.OK) {
                        for (final VectorWrapper<?> wrapper : probeBatch) {
                            wrapper.getValueVector().clear();
                        }
                        leftUpstream = next(HashJoinHelper.LEFT_INPUT, probeBatch);
                    }
                }
            }

            // No more output records, clean up and return
            state = BatchState.DONE;

            this.cleanup();

            return IterOutcome.NONE;
        } catch (SchemaChangeException e) {
            context.getExecutorState().fail(e);
            killIncoming(false);
            return IterOutcome.STOP;
        }
    }

    private void setupHashTable() throws SchemaChangeException {
        final List<Comparator> comparators = Lists.newArrayListWithExpectedSize(conditions.size());
        conditions.forEach(cond -> comparators.add(JoinUtils.checkAndReturnSupportedJoinComparator(cond)));

        // Setup the hash table configuration object
        List<NamedExpression> leftExpr = new ArrayList<>(conditions.size());

        // Create named expressions from the conditions
        for (int i = 0; i < conditions.size(); i++) {
            leftExpr.add(new NamedExpression(conditions.get(i).getLeft(), new FieldReference("probe_side_" + i)));
        }

        // Set the left named expression to be null if the probe batch is empty.
        if (leftUpstream != IterOutcome.OK_NEW_SCHEMA && leftUpstream != IterOutcome.OK) {
            leftExpr = null;
        } else {
            if (probeBatch.getSchema().getSelectionVectorMode() != BatchSchema.SelectionVectorMode.NONE) {
                final String errorMsg = new StringBuilder()
                        .append("Hash join does not support probe batch with selection vectors. ")
                        .append("Probe batch has selection mode = ")
                        .append(probeBatch.getSchema().getSelectionVectorMode()).toString();
                throw new SchemaChangeException(errorMsg);
            }
        }

        final HashTableConfig htConfig = new HashTableConfig(
                (int) context.getOptions().getOption(ExecConstants.MIN_HASH_TABLE_SIZE), true,
                HashTable.DEFAULT_LOAD_FACTOR, rightExpr, leftExpr, comparators);

        // Create the chained hash table
        baseHashTable = new ChainedHashTable(htConfig, context, allocator, buildBatch, probeBatch, null);
    }

    /**
     *  Call only after num partitions is known
     */
    private void delayedSetup() {
        //
        //  Find out the estimated max batch size, etc
        //  and compute the max numPartitions possible
        //  See partitionNumTuning()
        //

        partitionMask = numPartitions - 1; // e.g. 32 --> 0x1F
        bitsInMask = Integer.bitCount(partitionMask); // e.g. 0x1F -> 5

        // Create the FIFO list of spilled partitions (pairs - inner/outer)
        spilledPartitionsList = new ArrayList<>();

        // Create array for the partitions
        partitions = new HashPartition[numPartitions];

        buildSideIsEmpty = false;
    }

    /**
     * Initialize fields (that may be reused when reading spilled partitions)
     */
    private void initializeBuild() {
        baseHashTable.updateIncoming(buildBatch, probeBatch); // in case we process the spilled files
        // Recreate the partitions every time build is initialized
        for (int part = 0; part < numPartitions; part++) {
            partitions[part] = new HashPartition(context, allocator, baseHashTable, buildBatch, probeBatch,
                    RECORDS_PER_BATCH, spillSet, part, cycleNum, numPartitions);
        }

        spilledInners = new HJSpilledPartition[numPartitions];

    }

    /**
     * Tunes the number of partitions used by {@link HashJoinBatch}. If it is not possible to spill it gives up and reverts
     * to unbounded in memory operation.
     * @param maxBatchSize
     * @param buildCalc
     * @return
     */
    private HashJoinMemoryCalculator.BuildSidePartitioning partitionNumTuning(int maxBatchSize,
            HashJoinMemoryCalculator.BuildSidePartitioning buildCalc) {
        // Get auto tuning result
        numPartitions = buildCalc.getNumPartitions();

        if (logger.isDebugEnabled()) {
            logger.debug(buildCalc.makeDebugString());
        }

        if (buildCalc.getMaxReservedMemory() > allocator.getLimit()) {
            // We don't have enough memory to do any spilling. Give up and do no spilling and have no limits

            // TODO dirty hack to prevent regressions. Remove this once batch sizing is implemented.
            // We don't have enough memory to do partitioning, we have to do everything in memory
            final String message = String.format(
                    "When using the minimum number of partitions %d we require %s memory but only have %s available. "
                            + "Forcing legacy behavoir of using unbounded memory in order to prevent regressions.",
                    numPartitions, FileUtils.byteCountToDisplaySize(buildCalc.getMaxReservedMemory()),
                    FileUtils.byteCountToDisplaySize(allocator.getLimit()));
            logger.warn(message);

            // create a Noop memory calculator
            final HashJoinMemoryCalculator calc = getCalculatorImpl();
            calc.initialize(false);
            buildCalc = calc.next();

            buildCalc.initialize(true, true, // TODO Fix after growing hash values bug fixed
                    buildBatch, probeBatch, buildJoinColumns, allocator.getLimit(), numPartitions,
                    RECORDS_PER_BATCH, RECORDS_PER_BATCH, maxBatchSize, maxBatchSize,
                    batchMemoryManager.getOutputRowCount(), batchMemoryManager.getOutputBatchSize(),
                    HashTable.DEFAULT_LOAD_FACTOR);

            disableSpilling(null);
        }

        return buildCalc;
    }

    /**
     *  Disable spilling - use only a single partition and set the memory limit to the max ( 10GB )
     *  @param reason If not null - log this as warning, else check fallback setting to either warn or fail.
     */
    private void disableSpilling(String reason) {
        // Fail, or just issue a warning if a reason was given, or a fallback option is enabled
        if (reason == null) {
            final boolean fallbackEnabled = context.getOptions()
                    .getOption(ExecConstants.HASHJOIN_FALLBACK_ENABLED_KEY).bool_val;
            if (fallbackEnabled) {
                logger.warn(
                        "Spilling is disabled - not enough memory available for internal partitioning. Falling back"
                                + " to use unbounded memory");
            } else {
                throw UserException.resourceError().message(String.format(
                        "Not enough memory for internal partitioning and fallback mechanism for "
                                + "HashJoin to use unbounded memory is disabled. Either enable fallback config %s using Alter "
                                + "session/system command or increase memory limit for Drillbit",
                        ExecConstants.HASHJOIN_FALLBACK_ENABLED_KEY)).build(logger);
            }
        } else {
            logger.warn(reason);
        }

        numPartitions = 1; // We are only using one partition
        canSpill = false; // We cannot spill
        allocator.setLimit(AbstractBase.MAX_ALLOCATION); // Violate framework and force unbounded memory
    }

    /**
     *  Execute the BUILD phase; first read incoming and split rows into partitions;
     *  may decide to spill some of the partitions
     *
     * @throws SchemaChangeException
     */
    public void executeBuildPhase() throws SchemaChangeException {
        if (rightUpstream == IterOutcome.NONE) {
            // empty right
            return;
        }

        HashJoinMemoryCalculator.BuildSidePartitioning buildCalc;
        boolean firstCycle = cycleNum == 0;

        {
            // Initializing build calculator
            // Limit scope of these variables to this block
            int maxBatchSize = firstCycle ? RecordBatch.MAX_BATCH_SIZE : RECORDS_PER_BATCH;
            boolean hasProbeData = leftUpstream != IterOutcome.NONE;
            boolean doMemoryCalculation = canSpill && hasProbeData;
            HashJoinMemoryCalculator calc = getCalculatorImpl();

            calc.initialize(doMemoryCalculation);
            buildCalc = calc.next();

            // We've sniffed first non empty build and probe batches so we have enough information to create a calculator
            buildCalc.initialize(firstCycle, true, // TODO Fix after growing hash values bug fixed
                    buildBatch, probeBatch, buildJoinColumns, allocator.getLimit(), numPartitions,
                    RECORDS_PER_BATCH, RECORDS_PER_BATCH, maxBatchSize, maxBatchSize,
                    batchMemoryManager.getOutputRowCount(), batchMemoryManager.getOutputBatchSize(),
                    HashTable.DEFAULT_LOAD_FACTOR);

            if (firstCycle && doMemoryCalculation) {
                // Do auto tuning
                buildCalc = partitionNumTuning(maxBatchSize, buildCalc);
            }
        }

        if (firstCycle) {
            // Do initial setup only on the first cycle
            delayedSetup();
        }

        initializeBuild();

        // Make the calculator aware of our partitions
        final HashJoinMemoryCalculator.PartitionStatSet partitionStatSet = new HashJoinMemoryCalculator.PartitionStatSet(
                partitions);
        buildCalc.setPartitionStatSet(partitionStatSet);

        boolean moreData = true;
        while (moreData) {
            switch (rightUpstream) {
            case OUT_OF_MEMORY:
            case NONE:
            case NOT_YET:
            case STOP:
                moreData = false;
                continue;

            case OK_NEW_SCHEMA:
                if (!rightSchema.equals(buildBatch.getSchema())) {
                    throw SchemaChangeException.schemaChanged(
                            "Hash join does not support schema changes in build side.", rightSchema,
                            buildBatch.getSchema());
                }
                for (HashPartition partn : partitions) {
                    partn.updateBatches();
                }
                // Fall through
            case OK:
                batchMemoryManager.update(buildBatch, RIGHT_INDEX, 0, true);
                // Special treatment (when no spill, and single partition) -- use the incoming vectors as they are (no row copy)
                if (numPartitions == 1) {
                    partitions[0].appendBatch(buildBatch);
                    break;
                }
                final int currentRecordCount = buildBatch.getRecordCount();

                if (cycleNum > 0) {
                    read_right_HV_vector = (IntVector) buildBatch.getContainer().getLast();
                }

                // For every record in the build batch, hash the key columns and keep the result
                for (int ind = 0; ind < currentRecordCount; ind++) {
                    int hashCode = (cycleNum == 0) ? partitions[0].getBuildHashCode(ind)
                            : read_right_HV_vector.getAccessor().get(ind); // get the hash value from the HV column
                    int currPart = hashCode & partitionMask;
                    hashCode >>>= bitsInMask;
                    // Append the new inner row to the appropriate partition; spill (that partition) if needed
                    partitions[currPart].appendInnerRow(buildBatch.getContainer(), ind, hashCode, buildCalc); // may spill if needed
                }

                if (read_right_HV_vector != null) {
                    read_right_HV_vector.clear();
                    read_right_HV_vector = null;
                }
                break;
            }
            // Get the next incoming record batch
            rightUpstream = next(HashJoinHelper.RIGHT_INPUT, buildBatch);
        }

        // Move the remaining current batches into their temp lists, or spill
        // them if the partition is spilled. Add the spilled partitions into
        // the spilled partitions list
        if (numPartitions > 1) { // a single partition needs no completion
            for (HashPartition partn : partitions) {
                partn.completeAnInnerBatch(false, partn.isSpilled());
            }
        }

        HashJoinMemoryCalculator.PostBuildCalculations postBuildCalc = buildCalc.next();
        postBuildCalc.initialize();

        //
        //  Traverse all the in-memory partitions' incoming batches, and build their hash tables
        //

        for (int index = 0; index < partitions.length; index++) {
            final HashPartition partn = partitions[index];

            if (partn.isSpilled()) {
                // Don't build hash tables for spilled partitions
                continue;
            }

            try {
                if (postBuildCalc.shouldSpill()) {
                    // Spill this partition if we need to make room
                    partn.spillThisPartition();
                } else {
                    // Only build hash tables for partitions that are not spilled
                    partn.buildContainersHashTableAndHelper();
                }
            } catch (OutOfMemoryException e) {
                final String message = "Failed building hash table on partition " + index + ":\n"
                        + makeDebugString() + "\n" + postBuildCalc.makeDebugString();
                // Include debug info
                throw new OutOfMemoryException(message, e);
            }
        }

        if (logger.isDebugEnabled()) {
            logger.debug(postBuildCalc.makeDebugString());
        }

        for (HashPartition partn : partitions) {
            if (partn.isSpilled()) {
                HJSpilledPartition sp = new HJSpilledPartition();
                sp.innerSpillFile = partn.getSpillFile();
                sp.innerSpilledBatches = partn.getPartitionBatchesCount();
                sp.cycleNum = cycleNum; // remember the current cycle
                sp.origPartn = partn.getPartitionNum(); // for debugging / filename
                sp.prevOrigPartn = originalPartition; // for debugging / filename
                spilledPartitionsList.add(sp);

                spilledInners[partn.getPartitionNum()] = sp; // for the outer to find the SP later
                partn.closeWriter();
            }
        }
    }

    private void setupOutputContainerSchema() {

        if (rightSchema != null) {
            for (final MaterializedField field : rightSchema) {
                final MajorType inputType = field.getType();
                final MajorType outputType;
                // If left or full outer join, then the output type must be nullable. However, map types are
                // not nullable so we must exclude them from the check below (see DRILL-2197).
                if ((joinType == JoinRelType.LEFT || joinType == JoinRelType.FULL)
                        && inputType.getMode() == DataMode.REQUIRED
                        && inputType.getMinorType() != TypeProtos.MinorType.MAP) {
                    outputType = Types.overrideMode(inputType, DataMode.OPTIONAL);
                } else {
                    outputType = inputType;
                }

                // make sure to project field with children for children to show up in the schema
                final MaterializedField projected = field.withType(outputType);
                // Add the vector to our output container
                container.addOrGet(projected);
            }
        }

        if (probeSchema != null) { // a probe schema was seen (even though the probe may had no rows)
            for (final VectorWrapper<?> vv : probeBatch) {
                final MajorType inputType = vv.getField().getType();
                final MajorType outputType;

                // If right or full outer join then the output type should be optional. However, map types are
                // not nullable so we must exclude them from the check below (see DRILL-2771, DRILL-2197).
                if ((joinType == JoinRelType.RIGHT || joinType == JoinRelType.FULL)
                        && inputType.getMode() == DataMode.REQUIRED
                        && inputType.getMinorType() != TypeProtos.MinorType.MAP) {
                    outputType = Types.overrideMode(inputType, DataMode.OPTIONAL);
                } else {
                    outputType = inputType;
                }

                final ValueVector v = container
                        .addOrGet(MaterializedField.create(vv.getField().getName(), outputType));
                if (v instanceof AbstractContainerVector) {
                    vv.getValueVector().makeTransferPair(v);
                    v.clear();
                }
            }
        }

    }

    // (After the inner side was read whole) - Has that inner partition spilled
    public boolean isSpilledInner(int part) {
        if (spilledInners == null) {
            return false;
        } // empty inner
        return spilledInners[part] != null;
    }

    /**
     *  The constructor
     *
     * @param popConfig
     * @param context
     * @param left  -- probe/outer side incoming input
     * @param right -- build/iner side incoming input
     * @throws OutOfMemoryException
     */
    public HashJoinBatch(HashJoinPOP popConfig, FragmentContext context,
            RecordBatch left, /*Probe side record batch*/
            RecordBatch right /*Build side record batch*/
    ) throws OutOfMemoryException {
        super(popConfig, context, true, left, right);
        this.buildBatch = right;
        this.probeBatch = left;
        joinType = popConfig.getJoinType();
        conditions = popConfig.getConditions();
        this.popConfig = popConfig;

        rightExpr = new ArrayList<>(conditions.size());
        buildJoinColumns = Sets.newHashSet();

        for (int i = 0; i < conditions.size(); i++) {
            final SchemaPath rightPath = (SchemaPath) conditions.get(i).getRight();
            final PathSegment.NameSegment nameSegment = (PathSegment.NameSegment) rightPath.getLastSegment();
            buildJoinColumns.add(nameSegment.getPath());
            final String refName = "build_side_" + i;
            rightExpr.add(new NamedExpression(conditions.get(i).getRight(), new FieldReference(refName)));
        }

        this.allocator = oContext.getAllocator();

        numPartitions = (int) context.getOptions().getOption(ExecConstants.HASHJOIN_NUM_PARTITIONS_VALIDATOR);
        if (numPartitions == 1) { //
            disableSpilling("Spilling is disabled due to configuration setting of num_partitions to 1");
        }

        numPartitions = BaseAllocator.nextPowerOfTwo(numPartitions); // in case not a power of 2

        final long memLimit = context.getOptions().getOption(ExecConstants.HASHJOIN_MAX_MEMORY_VALIDATOR);

        if (memLimit != 0) {
            allocator.setLimit(memLimit);
        }

        RECORDS_PER_BATCH = (int) context.getOptions()
                .getOption(ExecConstants.HASHJOIN_NUM_ROWS_IN_BATCH_VALIDATOR);
        maxBatchesInMemory = (int) context.getOptions()
                .getOption(ExecConstants.HASHJOIN_MAX_BATCHES_IN_MEMORY_VALIDATOR);

        logger.info("Memory limit {} bytes", FileUtils.byteCountToDisplaySize(allocator.getLimit()));
        spillSet = new SpillSet(context, popConfig);

        // Create empty partitions (in the ctor - covers the case where right side is empty)
        partitions = new HashPartition[0];

        // get the output batch size from config.
        int configuredBatchSize = (int) context.getOptions().getOption(ExecConstants.OUTPUT_BATCH_SIZE_VALIDATOR);
        batchMemoryManager = new JoinBatchMemoryManager(configuredBatchSize, left, right);
    }

    /**
     * This method is called when {@link HashJoinBatch} closes. It cleans up left over spilled files that are in the spill queue, and closes the
     * spillSet.
     */
    private void cleanup() {
        if (buildSideIsEmpty) {
            return;
        } // not set up; nothing to clean
        if (spillSet.getWriteBytes() > 0) {
            stats.setLongStat(Metric.SPILL_MB, // update stats - total MB spilled
                    (int) Math.round(spillSet.getWriteBytes() / 1024.0D / 1024.0));
        }
        // clean (and deallocate) each partition, and delete its spill file
        for (HashPartition partn : partitions) {
            partn.close();
        }

        // delete any spill file left in unread spilled partitions
        while (!spilledPartitionsList.isEmpty()) {
            HJSpilledPartition sp = spilledPartitionsList.remove(0);
            try {
                spillSet.delete(sp.innerSpillFile);
            } catch (IOException e) {
                logger.warn("Cleanup: Failed to delete spill file {}", sp.innerSpillFile);
            }
            try { // outer file is added later; may be null if cleaning prematurely
                if (sp.outerSpillFile != null) {
                    spillSet.delete(sp.outerSpillFile);
                }
            } catch (IOException e) {
                logger.warn("Cleanup: Failed to delete spill file {}", sp.outerSpillFile);
            }
        }
        // Delete the currently handled (if any) spilled files
        spillSet.close(); // delete the spill directory(ies)
    }

    /**
     * This creates a string that summarizes the memory usage of the operator.
     * @return A memory dump string.
     */
    public String makeDebugString() {
        final StringBuilder sb = new StringBuilder();

        for (int partitionIndex = 0; partitionIndex < partitions.length; partitionIndex++) {
            final String partitionPrefix = "Partition " + partitionIndex + ": ";
            final HashPartition hashPartition = partitions[partitionIndex];
            sb.append(partitionPrefix).append(hashPartition.makeDebugString()).append("\n");
        }

        return sb.toString();
    }

    /**
     * Updates the {@link HashTable} and spilling stats after the original build side is processed.
     *
     * Note: this does not update all the stats. The cycleNum is updated dynamically in {@link #innerNext()} and the total bytes
     * written is updated at close time in {@link #cleanup()}.
     */
    private void updateStats() {
        if (buildSideIsEmpty) {
            return;
        } // no stats when the right side is empty
        if (cycleNum > 0) {
            return;
        } // These stats are only for before processing spilled files

        final HashTableStats htStats = new HashTableStats();
        long numSpilled = 0;
        HashTableStats newStats = new HashTableStats();
        // sum the stats from all the partitions
        for (HashPartition partn : partitions) {
            if (partn.isSpilled()) {
                numSpilled++;
            }
            partn.getStats(newStats);
            htStats.addStats(newStats);
        }

        this.stats.setLongStat(Metric.NUM_BUCKETS, htStats.numBuckets);
        this.stats.setLongStat(Metric.NUM_ENTRIES, htStats.numEntries);
        this.stats.setLongStat(Metric.NUM_RESIZING, htStats.numResizing);
        this.stats.setLongStat(Metric.RESIZING_TIME_MS, htStats.resizingTime);
        this.stats.setLongStat(Metric.NUM_PARTITIONS, numPartitions);
        this.stats.setLongStat(Metric.SPILL_CYCLE, cycleNum); // Put 0 in case no spill
        this.stats.setLongStat(Metric.SPILLED_PARTITIONS, numSpilled);
    }

    @Override
    public void killIncoming(boolean sendUpstream) {
        wasKilled = true;
        probeBatch.kill(sendUpstream);
        buildBatch.kill(sendUpstream);
    }

    public void updateMetrics() {
        stats.setLongStat(HashJoinBatch.Metric.LEFT_INPUT_BATCH_COUNT,
                batchMemoryManager.getNumIncomingBatches(LEFT_INDEX));
        stats.setLongStat(HashJoinBatch.Metric.LEFT_AVG_INPUT_BATCH_BYTES,
                batchMemoryManager.getAvgInputBatchSize(LEFT_INDEX));
        stats.setLongStat(HashJoinBatch.Metric.LEFT_AVG_INPUT_ROW_BYTES,
                batchMemoryManager.getAvgInputRowWidth(LEFT_INDEX));
        stats.setLongStat(HashJoinBatch.Metric.LEFT_INPUT_RECORD_COUNT,
                batchMemoryManager.getTotalInputRecords(LEFT_INDEX));

        stats.setLongStat(HashJoinBatch.Metric.RIGHT_INPUT_BATCH_COUNT,
                batchMemoryManager.getNumIncomingBatches(RIGHT_INDEX));
        stats.setLongStat(HashJoinBatch.Metric.RIGHT_AVG_INPUT_BATCH_BYTES,
                batchMemoryManager.getAvgInputBatchSize(RIGHT_INDEX));
        stats.setLongStat(HashJoinBatch.Metric.RIGHT_AVG_INPUT_ROW_BYTES,
                batchMemoryManager.getAvgInputRowWidth(RIGHT_INDEX));
        stats.setLongStat(HashJoinBatch.Metric.RIGHT_INPUT_RECORD_COUNT,
                batchMemoryManager.getTotalInputRecords(RIGHT_INDEX));

        stats.setLongStat(HashJoinBatch.Metric.OUTPUT_BATCH_COUNT, batchMemoryManager.getNumOutgoingBatches());
        stats.setLongStat(HashJoinBatch.Metric.AVG_OUTPUT_BATCH_BYTES, batchMemoryManager.getAvgOutputBatchSize());
        stats.setLongStat(HashJoinBatch.Metric.AVG_OUTPUT_ROW_BYTES, batchMemoryManager.getAvgOutputRowWidth());
        stats.setLongStat(HashJoinBatch.Metric.OUTPUT_RECORD_COUNT, batchMemoryManager.getTotalOutputRecords());
    }

    @Override
    public void close() {
        if (cycleNum > 0) { // spilling happened
            // In case closing due to cancellation, BaseRootExec.close() does not close the open
            // SpilledRecordBatch "scanners" as it only knows about the original left/right ops.
            killIncoming(false);
        }

        updateMetrics();

        logger.debug(
                "BATCH_STATS, incoming aggregate left: batch count : {}, avg bytes : {},  avg row bytes : {}, record count : {}",
                batchMemoryManager.getNumIncomingBatches(JoinBatchMemoryManager.LEFT_INDEX),
                batchMemoryManager.getAvgInputBatchSize(JoinBatchMemoryManager.LEFT_INDEX),
                batchMemoryManager.getAvgInputRowWidth(JoinBatchMemoryManager.LEFT_INDEX),
                batchMemoryManager.getTotalInputRecords(JoinBatchMemoryManager.LEFT_INDEX));

        logger.debug(
                "BATCH_STATS, incoming aggregate right: batch count : {}, avg bytes : {},  avg row bytes : {}, record count : {}",
                batchMemoryManager.getNumIncomingBatches(JoinBatchMemoryManager.RIGHT_INDEX),
                batchMemoryManager.getAvgInputBatchSize(JoinBatchMemoryManager.RIGHT_INDEX),
                batchMemoryManager.getAvgInputRowWidth(JoinBatchMemoryManager.RIGHT_INDEX),
                batchMemoryManager.getTotalInputRecords(JoinBatchMemoryManager.RIGHT_INDEX));

        logger.debug(
                "BATCH_STATS, outgoing aggregate: batch count : {}, avg bytes : {},  avg row bytes : {}, record count : {}",
                batchMemoryManager.getNumOutgoingBatches(), batchMemoryManager.getAvgOutputBatchSize(),
                batchMemoryManager.getAvgOutputRowWidth(), batchMemoryManager.getTotalOutputRecords());

        this.cleanup();
        super.close();
    }

    public HashJoinProbe setupHashJoinProbe() throws ClassTransformationException, IOException {
        final CodeGenerator<HashJoinProbe> cg = CodeGenerator.get(HashJoinProbe.TEMPLATE_DEFINITION,
                context.getOptions());
        cg.plainJavaCapable(true);
        // cg.saveCodeForDebugging(true);

        //  No real code generation !!

        final HashJoinProbe hj = context.getImplementationClass(cg);
        return hj;
    }

}