org.apache.hadoop.hive.ql.parse.mr3.GenMR3Utils.java Source code

Introduction

Here is the source code for org.apache.hadoop.hive.ql.parse.mr3.GenMR3Utils.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
 * agreements. See the NOTICE
 * file distributed with this work for additional information regarding copyright ownership. The ASF
 * licenses this file
 * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in
 * compliance with the
 * License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by
 * applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language
 * governing permissions and limitations under the License.
 */

package org.apache.hadoop.hive.ql.parse.mr3;

import java.util.ArrayList;
import java.util.Deque;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.AppMasterEventOperator;
import org.apache.hadoop.hive.ql.exec.FetchTask;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.HashTableDummyOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.UnionOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.TableDesc;

import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;

/**
 * GenMR3Utils is a collection of shared helper methods to produce MR3Work
 */
public class GenMR3Utils {
    static final private Log LOG = LogFactory.getLog(GenMR3Utils.class.getName());

    // sequence number is used to name vertices (e.g.: Map 1, Reduce 14, ...)
    private int sequenceNumber = 0;

    // singleton
    private static GenMR3Utils utils;

    public static GenMR3Utils getUtils() {
        if (utils == null) {
            utils = new GenMR3Utils();
        }
        return utils;
    }

    protected GenMR3Utils() {
    }

    public void resetSequenceNumber() {
        sequenceNumber = 0;
    }

    // removes any union operator and clones the plan
    public void removeUnionOperators(Configuration conf, GenMR3ProcContext context, BaseWork work)
            throws SemanticException {

        List<Operator<?>> roots = new ArrayList<Operator<?>>();
        roots.addAll(work.getAllRootOperators());
        if (work.getDummyOps() != null) {
            roots.addAll(work.getDummyOps());
        }
        roots.addAll(context.eventOperatorSet);

        // need to clone the plan.
        List<Operator<?>> newRoots = Utilities.cloneOperatorTree(conf, roots);

        // we're cloning the operator plan but we're retaining the original work. That means
        // that root operators have to be replaced with the cloned ops. The replacement map
        // tells you what that mapping is.
        BiMap<Operator<?>, Operator<?>> replacementMap = HashBiMap.create();

        // there's some special handling for dummyOps required. Mapjoins won't be properly
        // initialized if their dummy parents aren't initialized. Since we cloned the plan
        // we need to replace the dummy operators in the work with the cloned ones.
        List<HashTableDummyOperator> dummyOps = new LinkedList<HashTableDummyOperator>();

        Iterator<Operator<?>> it = newRoots.iterator();
        for (Operator<?> orig : roots) {
            Operator<?> newRoot = it.next();

            replacementMap.put(orig, newRoot);

            if (newRoot instanceof HashTableDummyOperator) {
                // dummy ops need to be updated to the cloned ones.
                dummyOps.add((HashTableDummyOperator) newRoot);
                it.remove();
            } else if (newRoot instanceof AppMasterEventOperator) {
                // event operators point to table scan operators. When cloning these we
                // need to restore the original scan.
                if (newRoot.getConf() instanceof DynamicPruningEventDesc) {
                    TableScanOperator ts = ((DynamicPruningEventDesc) orig.getConf()).getTableScan();
                    if (ts == null) {
                        throw new AssertionError("No table scan associated with dynamic event pruning. " + orig);
                    }
                    ((DynamicPruningEventDesc) newRoot.getConf()).setTableScan(ts);
                }
                it.remove();
            } else {
                if (newRoot instanceof TableScanOperator) {
                    if (context.tsToEventMap.containsKey(orig)) {
                        // we need to update event operators with the cloned table scan
                        for (AppMasterEventOperator event : context.tsToEventMap.get(orig)) {
                            ((DynamicPruningEventDesc) event.getConf()).setTableScan((TableScanOperator) newRoot);
                        }
                    }
                }
                context.rootToWorkMap.remove(orig);
                context.rootToWorkMap.put(newRoot, work);
            }
        }

        // now we remove all the unions. we throw away any branch that's not reachable from
        // the current set of roots. The reason is that those branches will be handled in
        // different tasks.
        Deque<Operator<?>> operators = new LinkedList<Operator<?>>();
        operators.addAll(newRoots);

        Set<Operator<?>> seen = new HashSet<Operator<?>>();

        while (!operators.isEmpty()) {
            Operator<?> current = operators.pop();
            seen.add(current);

            if (current instanceof FileSinkOperator) {
                FileSinkOperator fileSink = (FileSinkOperator) current;

                // remember it for additional processing later
                context.fileSinkSet.add(fileSink);

                FileSinkDesc desc = fileSink.getConf();
                Path path = desc.getDirName();
                List<FileSinkDesc> linked;

                if (!context.linkedFileSinks.containsKey(path)) {
                    linked = new ArrayList<FileSinkDesc>();
                    context.linkedFileSinks.put(path, linked);
                }
                linked = context.linkedFileSinks.get(path);
                linked.add(desc);

                desc.setDirName(new Path(path, "" + linked.size()));
                desc.setLinkedFileSinkDesc(linked);
            }

            if (current instanceof AppMasterEventOperator) {
                // remember for additional processing later
                context.eventOperatorSet.add((AppMasterEventOperator) current);

                // mark the original as abandoned. Don't need it anymore.
                context.abandonedEventOperatorSet
                        .add((AppMasterEventOperator) replacementMap.inverse().get(current));
            }

            if (current instanceof UnionOperator) {
                Operator<?> parent = null;
                int count = 0;

                for (Operator<?> op : current.getParentOperators()) {
                    if (seen.contains(op)) {
                        ++count;
                        parent = op;
                    }
                }

                // we should have been able to reach the union from only one side.
                assert count <= 1;

                if (parent == null) {
                    // root operator is union (can happen in reducers)
                    replacementMap.put(current, current.getChildOperators().get(0));
                } else {
                    parent.removeChildAndAdoptItsChildren(current);
                }
            }

            if (current instanceof FileSinkOperator || current instanceof ReduceSinkOperator) {
                current.setChildOperators(null);
            } else {
                operators.addAll(current.getChildOperators());
            }
        }
        work.setDummyOps(dummyOps);
        work.replaceRoots(replacementMap);
    }

    public void processFileSink(GenMR3ProcContext context, FileSinkOperator fileSink) throws SemanticException {

        ParseContext parseContext = context.parseContext;

        boolean isInsertTable = // is INSERT OVERWRITE TABLE
                GenMapRedUtils.isInsertInto(parseContext, fileSink);
        HiveConf hconf = parseContext.getConf();

        boolean chDir = GenMapRedUtils.isMergeRequired(context.moveTask, hconf, fileSink, context.currentTask,
                isInsertTable);

        Path finalName = GenMapRedUtils.createMoveTask(context.currentTask, chDir, fileSink, parseContext,
                context.moveTask, hconf, context.dependencyTask);

        if (chDir) {
            // Merge the files in the destination table/partitions by creating Map-only merge job
            // If underlying data is RCFile or OrcFile, RCFileBlockMerge task or
            // OrcFileStripeMerge task would be created.
            LOG.info("using CombineHiveInputformat for the merge job");
            GenMapRedUtils.createMRWorkForMergingFiles(fileSink, finalName, context.dependencyTask,
                    context.moveTask, hconf, context.currentTask);
        }

        FetchTask fetchTask = parseContext.getFetchTask();
        if (fetchTask != null && context.currentTask.getNumChild() == 0) {
            if (fetchTask.isFetchFrom(fileSink.getConf())) {
                context.currentTask.setFetchSource(true);
            }
        }
    }

    /**
     * processAppMasterEvent sets up the event descriptor and the MapWork.
     *
     * @param procCtx
     * @param event
     */
    public void processAppMasterEvent(GenMR3ProcContext procCtx, AppMasterEventOperator event) {

        if (procCtx.abandonedEventOperatorSet.contains(event)) {
            // don't need this anymore
            return;
        }

        DynamicPruningEventDesc eventDesc = (DynamicPruningEventDesc) event.getConf();
        TableScanOperator ts = eventDesc.getTableScan();

        MapWork work = (MapWork) procCtx.rootToWorkMap.get(ts);
        if (work == null) {
            throw new AssertionError("No work found for tablescan " + ts);
        }

        BaseWork enclosingWork = getEnclosingWork(event, procCtx);
        if (enclosingWork == null) {
            throw new AssertionError("Cannot find work for operator" + event);
        }
        String sourceName = enclosingWork.getName();

        // store the vertex name in the operator pipeline
        eventDesc.setVertexName(work.getName());
        eventDesc.setInputName(work.getAliases().get(0));

        // store table descriptor in map-work
        if (!work.getEventSourceTableDescMap().containsKey(sourceName)) {
            work.getEventSourceTableDescMap().put(sourceName, new LinkedList<TableDesc>());
        }
        List<TableDesc> tables = work.getEventSourceTableDescMap().get(sourceName);
        tables.add(event.getConf().getTable());

        // store column name in map-work
        if (!work.getEventSourceColumnNameMap().containsKey(sourceName)) {
            work.getEventSourceColumnNameMap().put(sourceName, new LinkedList<String>());
        }
        List<String> columns = work.getEventSourceColumnNameMap().get(sourceName);
        columns.add(eventDesc.getTargetColumnName());

        // store partition key expr in map-work
        if (!work.getEventSourcePartKeyExprMap().containsKey(sourceName)) {
            work.getEventSourcePartKeyExprMap().put(sourceName, new LinkedList<ExprNodeDesc>());
        }
        List<ExprNodeDesc> keys = work.getEventSourcePartKeyExprMap().get(sourceName);
        keys.add(eventDesc.getPartKey());
    }

    /**
     * getEncosingWork finds the BaseWork any given operator belongs to.
     */
    public BaseWork getEnclosingWork(Operator<?> op, GenMR3ProcContext procCtx) {
        List<Operator<?>> ops = new ArrayList<Operator<?>>();
        findRoots(op, ops);
        for (Operator<?> r : ops) {
            BaseWork work = procCtx.rootToWorkMap.get(r);
            if (work != null) {
                return work;
            }
        }
        return null;
    }

    /*
     * findRoots returns all root operators (in ops) that result in operator op
     */
    private void findRoots(Operator<?> op, List<Operator<?>> ops) {
        List<Operator<?>> parents = op.getParentOperators();
        if (parents == null || parents.isEmpty()) {
            ops.add(op);
            return;
        }
        for (Operator<?> p : parents) {
            findRoots(p, ops);
        }
    }
}