// Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one or more contributor license * agreements. See the NOTICE * file distributed with this work for additional information regarding copyright ownership. The ASF * licenses this file * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in * compliance with the * License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 * Unless required by * applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the * specific language * governing permissions and limitations under the License. */ package org.apache.hadoop.hive.ql.parse.mr3; import java.util.ArrayList; import java.util.Deque; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.AppMasterEventOperator; import org.apache.hadoop.hive.ql.exec.FetchTask; import org.apache.hadoop.hive.ql.exec.FileSinkOperator; import org.apache.hadoop.hive.ql.exec.HashTableDummyOperator; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.UnionOperator; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils; import org.apache.hadoop.hive.ql.parse.ParseContext; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.BaseWork; import org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc; import 
org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.FileSinkDesc; import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.ql.plan.TableDesc; import com.google.common.collect.BiMap; import com.google.common.collect.HashBiMap; /** * GenMR3Utils is a collection of shared helper methods to produce MR3Work */ public class GenMR3Utils { static final private Log LOG = LogFactory.getLog(GenMR3Utils.class.getName()); // sequence number is used to name vertices (e.g.: Map 1, Reduce 14, ...) private int sequenceNumber = 0; // singleton private static GenMR3Utils utils; public static GenMR3Utils getUtils() { if (utils == null) { utils = new GenMR3Utils(); } return utils; } protected GenMR3Utils() { } public void resetSequenceNumber() { sequenceNumber = 0; } // removes any union operator and clones the plan public void removeUnionOperators(Configuration conf, GenMR3ProcContext context, BaseWork work) throws SemanticException { List<Operator<?>> roots = new ArrayList<Operator<?>>(); roots.addAll(work.getAllRootOperators()); if (work.getDummyOps() != null) { roots.addAll(work.getDummyOps()); } roots.addAll(context.eventOperatorSet); // need to clone the plan. List<Operator<?>> newRoots = Utilities.cloneOperatorTree(conf, roots); // we're cloning the operator plan but we're retaining the original work. That means // that root operators have to be replaced with the cloned ops. The replacement map // tells you what that mapping is. BiMap<Operator<?>, Operator<?>> replacementMap = HashBiMap.create(); // there's some special handling for dummyOps required. Mapjoins won't be properly // initialized if their dummy parents aren't initialized. Since we cloned the plan // we need to replace the dummy operators in the work with the cloned ones. 
List<HashTableDummyOperator> dummyOps = new LinkedList<HashTableDummyOperator>(); Iterator<Operator<?>> it = newRoots.iterator(); for (Operator<?> orig : roots) { Operator<?> newRoot = it.next(); replacementMap.put(orig, newRoot); if (newRoot instanceof HashTableDummyOperator) { // dummy ops need to be updated to the cloned ones. dummyOps.add((HashTableDummyOperator) newRoot); it.remove(); } else if (newRoot instanceof AppMasterEventOperator) { // event operators point to table scan operators. When cloning these we // need to restore the original scan. if (newRoot.getConf() instanceof DynamicPruningEventDesc) { TableScanOperator ts = ((DynamicPruningEventDesc) orig.getConf()).getTableScan(); if (ts == null) { throw new AssertionError("No table scan associated with dynamic event pruning. " + orig); } ((DynamicPruningEventDesc) newRoot.getConf()).setTableScan(ts); } it.remove(); } else { if (newRoot instanceof TableScanOperator) { if (context.tsToEventMap.containsKey(orig)) { // we need to update event operators with the cloned table scan for (AppMasterEventOperator event : context.tsToEventMap.get(orig)) { ((DynamicPruningEventDesc) event.getConf()).setTableScan((TableScanOperator) newRoot); } } } context.rootToWorkMap.remove(orig); context.rootToWorkMap.put(newRoot, work); } } // now we remove all the unions. we throw away any branch that's not reachable from // the current set of roots. The reason is that those branches will be handled in // different tasks. 
Deque<Operator<?>> operators = new LinkedList<Operator<?>>(); operators.addAll(newRoots); Set<Operator<?>> seen = new HashSet<Operator<?>>(); while (!operators.isEmpty()) { Operator<?> current = operators.pop(); seen.add(current); if (current instanceof FileSinkOperator) { FileSinkOperator fileSink = (FileSinkOperator) current; // remember it for additional processing later context.fileSinkSet.add(fileSink); FileSinkDesc desc = fileSink.getConf(); Path path = desc.getDirName(); List<FileSinkDesc> linked; if (!context.linkedFileSinks.containsKey(path)) { linked = new ArrayList<FileSinkDesc>(); context.linkedFileSinks.put(path, linked); } linked = context.linkedFileSinks.get(path); linked.add(desc); desc.setDirName(new Path(path, "" + linked.size())); desc.setLinkedFileSinkDesc(linked); } if (current instanceof AppMasterEventOperator) { // remember for additional processing later context.eventOperatorSet.add((AppMasterEventOperator) current); // mark the original as abandoned. Don't need it anymore. context.abandonedEventOperatorSet .add((AppMasterEventOperator) replacementMap.inverse().get(current)); } if (current instanceof UnionOperator) { Operator<?> parent = null; int count = 0; for (Operator<?> op : current.getParentOperators()) { if (seen.contains(op)) { ++count; parent = op; } } // we should have been able to reach the union from only one side. 
assert count <= 1; if (parent == null) { // root operator is union (can happen in reducers) replacementMap.put(current, current.getChildOperators().get(0)); } else { parent.removeChildAndAdoptItsChildren(current); } } if (current instanceof FileSinkOperator || current instanceof ReduceSinkOperator) { current.setChildOperators(null); } else { operators.addAll(current.getChildOperators()); } } work.setDummyOps(dummyOps); work.replaceRoots(replacementMap); } public void processFileSink(GenMR3ProcContext context, FileSinkOperator fileSink) throws SemanticException { ParseContext parseContext = context.parseContext; boolean isInsertTable = // is INSERT OVERWRITE TABLE GenMapRedUtils.isInsertInto(parseContext, fileSink); HiveConf hconf = parseContext.getConf(); boolean chDir = GenMapRedUtils.isMergeRequired(context.moveTask, hconf, fileSink, context.currentTask, isInsertTable); Path finalName = GenMapRedUtils.createMoveTask(context.currentTask, chDir, fileSink, parseContext, context.moveTask, hconf, context.dependencyTask); if (chDir) { // Merge the files in the destination table/partitions by creating Map-only merge job // If underlying data is RCFile or OrcFile, RCFileBlockMerge task or // OrcFileStripeMerge task would be created. LOG.info("using CombineHiveInputformat for the merge job"); GenMapRedUtils.createMRWorkForMergingFiles(fileSink, finalName, context.dependencyTask, context.moveTask, hconf, context.currentTask); } FetchTask fetchTask = parseContext.getFetchTask(); if (fetchTask != null && context.currentTask.getNumChild() == 0) { if (fetchTask.isFetchFrom(fileSink.getConf())) { context.currentTask.setFetchSource(true); } } } /** * processAppMasterEvent sets up the event descriptor and the MapWork. 
* * @param procCtx * @param event */ public void processAppMasterEvent(GenMR3ProcContext procCtx, AppMasterEventOperator event) { if (procCtx.abandonedEventOperatorSet.contains(event)) { // don't need this anymore return; } DynamicPruningEventDesc eventDesc = (DynamicPruningEventDesc) event.getConf(); TableScanOperator ts = eventDesc.getTableScan(); MapWork work = (MapWork) procCtx.rootToWorkMap.get(ts); if (work == null) { throw new AssertionError("No work found for tablescan " + ts); } BaseWork enclosingWork = getEnclosingWork(event, procCtx); if (enclosingWork == null) { throw new AssertionError("Cannot find work for operator" + event); } String sourceName = enclosingWork.getName(); // store the vertex name in the operator pipeline eventDesc.setVertexName(work.getName()); eventDesc.setInputName(work.getAliases().get(0)); // store table descriptor in map-work if (!work.getEventSourceTableDescMap().containsKey(sourceName)) { work.getEventSourceTableDescMap().put(sourceName, new LinkedList<TableDesc>()); } List<TableDesc> tables = work.getEventSourceTableDescMap().get(sourceName); tables.add(event.getConf().getTable()); // store column name in map-work if (!work.getEventSourceColumnNameMap().containsKey(sourceName)) { work.getEventSourceColumnNameMap().put(sourceName, new LinkedList<String>()); } List<String> columns = work.getEventSourceColumnNameMap().get(sourceName); columns.add(eventDesc.getTargetColumnName()); // store partition key expr in map-work if (!work.getEventSourcePartKeyExprMap().containsKey(sourceName)) { work.getEventSourcePartKeyExprMap().put(sourceName, new LinkedList<ExprNodeDesc>()); } List<ExprNodeDesc> keys = work.getEventSourcePartKeyExprMap().get(sourceName); keys.add(eventDesc.getPartKey()); } /** * getEncosingWork finds the BaseWork any given operator belongs to. 
*/ public BaseWork getEnclosingWork(Operator<?> op, GenMR3ProcContext procCtx) { List<Operator<?>> ops = new ArrayList<Operator<?>>(); findRoots(op, ops); for (Operator<?> r : ops) { BaseWork work = procCtx.rootToWorkMap.get(r); if (work != null) { return work; } } return null; } /* * findRoots returns all root operators (in ops) that result in operator op */ private void findRoots(Operator<?> op, List<Operator<?>> ops) { List<Operator<?>> parents = op.getParentOperators(); if (parents == null || parents.isEmpty()) { ops.add(op); return; } for (Operator<?> p : parents) { findRoots(p, ops); } } }