org.apache.pig.backend.hadoop.executionengine.tez.util.TezCompilerUtil.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.pig.backend.hadoop.executionengine.tez.util.TezCompilerUtil.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.backend.hadoop.executionengine.tez.util;

import java.io.IOException;
import java.lang.reflect.Type;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.ArrayUtils;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POProject;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POUserFunc;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POForEach;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POPackage;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.util.PlanHelper;
import org.apache.pig.backend.hadoop.executionengine.tez.plan.TezEdgeDescriptor;
import org.apache.pig.backend.hadoop.executionengine.tez.plan.TezOperPlan;
import org.apache.pig.backend.hadoop.executionengine.tez.plan.TezOperator;
import org.apache.pig.backend.hadoop.executionengine.tez.plan.operator.POLocalRearrangeTez;
import org.apache.pig.backend.hadoop.executionengine.tez.plan.operator.POStoreTez;
import org.apache.pig.backend.hadoop.executionengine.tez.plan.operator.POValueOutputTez;
import org.apache.pig.backend.hadoop.executionengine.tez.plan.udf.ReadScalarsTez;
import org.apache.pig.backend.hadoop.executionengine.tez.runtime.TezInput;
import org.apache.pig.backend.hadoop.executionengine.tez.runtime.TezOutput;
import org.apache.pig.builtin.RoundRobinPartitioner;
import org.apache.pig.builtin.TOBAG;
import org.apache.pig.data.DataType;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.plan.NodeIdGenerator;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.PlanException;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.tez.dag.api.EdgeProperty.DataMovementType;
import org.apache.tez.runtime.library.input.UnorderedKVInput;
import org.apache.tez.runtime.library.output.UnorderedKVOutput;
import org.apache.tez.runtime.library.output.UnorderedPartitionedKVOutput;

import com.google.common.collect.Lists;

/**
 * Static helpers used while compiling a Pig physical plan into a Tez operator plan:
 * wiring {@link TezOperator}s together with {@link TezEdgeDescriptor}s, re-pointing
 * input/output keys when the plan is rewired, and building a few small physical
 * operators (foreach, tmp store) that the compiler needs repeatedly.
 */
public class TezCompilerUtil {

    /**
     * Fully qualified name of the tuple class reported by the default {@link TupleFactory}.
     * Used as the intermediate value class in
     * {@link #configureValueOnlyTupleOutput(TezEdgeDescriptor, DataMovementType)}.
     */
    public static final String TUPLE_CLASS = TupleFactory.getInstance().tupleClass().getName();

    private TezCompilerUtil() {
        // Static utility holder; never instantiated.
    }

    /**
     * Ends {@code op1} as the equivalent of a map vertex and starts {@code op2} as the
     * equivalent of a reduce vertex, then connects the two in {@code tezPlan}.
     * <ol>
     * <li>{@code op1} is expected to be open (its plan still accepts a new leaf).</li>
     * <li>{@code op2} is expected to be blank; a POPackage is added to start it.</li>
     * <li>The POLocalRearrange/POPackage pair is trivial: the rearrange projects the whole
     * tuple ({@code *}) as a TUPLE key with index 0, and the package has a single,
     * non-inner input. A plain foreach (see {@link #getForEachPlain}) is appended after
     * the package.</li>
     * <li>The edge from {@code op1} to {@code op2} is created here via
     * {@link #connect(TezOperPlan, TezOperator, TezOperator)}; callers do not need to
     * connect the two operators themselves.</li>
     * </ol>
     *
     * @param tezPlan plan that contains both operators
     * @param op1     upstream operator that receives the local rearrange as its new leaf
     * @param op2     downstream operator that receives the package and the foreach
     * @param scope   scope used for the operator keys of the newly created operators
     * @param nig     generator for the node ids of the newly created operators
     * @throws PlanException if the rearrange index cannot be set or the plan cannot be modified
     */
    public static void simpleConnectTwoVertex(TezOperPlan tezPlan, TezOperator op1, TezOperator op2, String scope,
            NodeIdGenerator nig) throws PlanException {
        // Key expression of the local rearrange: project the entire input tuple.
        PhysicalPlan ep = new PhysicalPlan();
        POProject prjStar = new POProject(new OperatorKey(scope, nig.getNextNodeId(scope)));
        prjStar.setResultType(DataType.TUPLE);
        prjStar.setStar(true);
        ep.add(prjStar);

        List<PhysicalPlan> eps = new ArrayList<PhysicalPlan>();
        eps.add(ep);

        POLocalRearrangeTez lr = new POLocalRearrangeTez(new OperatorKey(scope, nig.getNextNodeId(scope)));
        try {
            lr.setIndex(0);
        } catch (ExecException e) {
            int errCode = 2058;
            String msg = "Unable to set index on the newly created POLocalRearrange.";
            throw new PlanException(msg, errCode, PigException.BUG, e);
        }
        lr.setKeyType(DataType.TUPLE);
        lr.setPlans(eps);
        lr.setResultType(DataType.TUPLE);
        lr.setOutputKey(op2.getOperatorKey().toString());

        op1.plan.addAsLeaf(lr);

        // Single-input, non-inner package keyed on a tuple starts the downstream vertex.
        POPackage pkg = new POPackage(new OperatorKey(scope, nig.getNextNodeId(scope)));
        pkg.getPkgr().setKeyType(DataType.TUPLE);
        pkg.setNumInps(1);
        boolean[] inner = { false };
        pkg.getPkgr().setInner(inner);
        op2.plan.add(pkg);

        op2.plan.addAsLeaf(getForEachPlain(scope, nig));

        connect(tezPlan, op1, op2);
    }

    /**
     * Connects {@code from} to {@code to} in {@code plan} and registers a freshly created
     * edge descriptor on both operators.
     * <p>
     * If the plan of {@code from} is not empty and its first leaf is a
     * {@link POLocalRearrangeTez}, that leaf's output key is set to the operator key of
     * {@code to}.
     *
     * @param plan plan that contains both operators
     * @param from upstream operator
     * @param to   downstream operator
     * @return the new edge descriptor shared by {@code to.inEdges} and {@code from.outEdges}
     * @throws PlanException if {@code plan.connect} fails
     */
    public static TezEdgeDescriptor connect(TezOperPlan plan, TezOperator from, TezOperator to)
            throws PlanException {
        plan.connect(from, to);
        if (!from.plan.isEmpty()) {
            PhysicalOperator leaf = from.plan.getLeaves().get(0);
            // It could be POStoreTez incase of sampling job in order by
            if (leaf instanceof POLocalRearrangeTez) {
                POLocalRearrangeTez lr = (POLocalRearrangeTez) leaf;
                lr.setOutputKey(to.getOperatorKey().toString());
            }
        }
        TezEdgeDescriptor edge = new TezEdgeDescriptor();
        registerEdge(from, to, edge);
        return edge;
    }

    /**
     * Connects {@code from} to {@code to} in {@code plan} and registers the supplied
     * {@code edge} on both operators. Unlike the three-argument overload, the physical
     * plan of {@code from} is not touched.
     *
     * @param plan plan that contains both operators
     * @param from upstream operator
     * @param to   downstream operator
     * @param edge edge descriptor to store in {@code to.inEdges} and {@code from.outEdges}
     * @throws PlanException if {@code plan.connect} fails
     */
    public static void connect(TezOperPlan plan, TezOperator from, TezOperator to, TezEdgeDescriptor edge)
            throws PlanException {
        plan.connect(from, to);
        registerEdge(from, to, edge);
    }

    /**
     * Connects {@code newPredecessor} to {@code tezOp}, registers {@code edge} on both, and,
     * when {@code oldInputKey} is not null, rewrites the inputs inside {@code tezOp} that
     * referred to {@code oldInputKey} so that they refer to the new predecessor instead.
     *
     * @param plan           plan that contains both operators
     * @param tezOp          downstream operator receiving a new predecessor
     * @param newPredecessor upstream operator to connect
     * @param edge           edge descriptor to register on both operators
     * @param oldInputKey    input key to replace, or {@code null} to skip the replacement
     * @throws PlanException if the connection or the input replacement fails
     */
    public static void connectTezOpToNewPredecessor(TezOperPlan plan, TezOperator tezOp, TezOperator newPredecessor,
            TezEdgeDescriptor edge, String oldInputKey) throws PlanException {
        plan.connect(newPredecessor, tezOp);
        registerEdge(newPredecessor, tezOp, edge);

        if (oldInputKey != null) {
            replaceInput(tezOp, oldInputKey, newPredecessor.getOperatorKey().toString());
        }
    }

    /**
     * Replaces {@code oldInputKey} with {@code newInputKey} on every {@link TezInput}
     * operator found in the plan of {@code tezOp}, and on every {@link ReadScalarsTez}
     * function wrapped by a {@link POUserFunc} in that plan. For the latter, the
     * constructor arguments of the function spec are refreshed from the updated inputs.
     *
     * @param tezOp       operator whose physical plan is scanned
     * @param oldInputKey input key to look for
     * @param newInputKey input key to put in its place
     * @throws PlanException wrapping any {@link VisitorException} raised while scanning the plan
     */
    public static void replaceInput(TezOperator tezOp, String oldInputKey, String newInputKey)
            throws PlanException {
        try {
            List<TezInput> inputs = PlanHelper.getPhysicalOperators(tezOp.plan, TezInput.class);
            for (TezInput input : inputs) {
                input.replaceInput(oldInputKey, newInputKey);
            }
            List<POUserFunc> userFuncs = PlanHelper.getPhysicalOperators(tezOp.plan, POUserFunc.class);
            for (POUserFunc userFunc : userFuncs) {
                if (userFunc.getFunc() instanceof ReadScalarsTez) {
                    TezInput input = (TezInput) userFunc.getFunc();
                    input.replaceInput(oldInputKey, newInputKey);
                    // Keep the function spec in step with the function's updated inputs.
                    userFunc.getFuncSpec().setCtorArgs(input.getTezInputs());
                }
            }
        } catch (VisitorException e) {
            throw new PlanException(e);
        }
    }

    /**
     * Connects {@code tezOp} to {@code newSuccessor}, registers {@code edge} on both, and,
     * when {@code oldOutputKey} is not null, rewrites the outputs inside {@code tezOp} that
     * referred to {@code oldOutputKey} so that they refer to the new successor instead.
     *
     * @param plan         plan that contains both operators
     * @param tezOp        upstream operator receiving a new successor
     * @param newSuccessor downstream operator to connect
     * @param edge         edge descriptor to register on both operators
     * @param oldOutputKey output key to replace, or {@code null} to skip the replacement
     * @throws PlanException if the connection or the output replacement fails
     */
    public static void connectTezOpToNewSuccesor(TezOperPlan plan, TezOperator tezOp, TezOperator newSuccessor,
            TezEdgeDescriptor edge, String oldOutputKey) throws PlanException {
        plan.connect(tezOp, newSuccessor);
        registerEdge(tezOp, newSuccessor, edge);

        if (oldOutputKey != null) {
            replaceOutput(tezOp, oldOutputKey, newSuccessor.getOperatorKey().toString());
        }
    }

    /**
     * Replaces {@code oldOutputKey} with {@code newOutputKey} on every {@link TezOutput}
     * operator in the plan of {@code tezOp} whose current outputs contain
     * {@code oldOutputKey}.
     *
     * @param tezOp        operator whose physical plan is scanned
     * @param oldOutputKey output key to look for
     * @param newOutputKey output key to put in its place
     * @throws PlanException wrapping any {@link VisitorException} raised while scanning the plan
     */
    public static void replaceOutput(TezOperator tezOp, String oldOutputKey, String newOutputKey)
            throws PlanException {
        try {
            List<TezOutput> tezOutputs = PlanHelper.getPhysicalOperators(tezOp.plan, TezOutput.class);
            for (TezOutput tezOut : tezOutputs) {
                if (ArrayUtils.contains(tezOut.getTezOutputs(), oldOutputKey)) {
                    tezOut.replaceOutput(oldOutputKey, newOutputKey);
                }
            }
        } catch (VisitorException e) {
            throw new PlanException(e);
        }
    }

    /**
     * Tells whether {@code inputKey} is read by a {@link TezInput} operator in the plan of
     * {@code tezOp}, or by a {@link ReadScalarsTez} function wrapped in a {@link POUserFunc}
     * in that plan.
     *
     * @param inputKey input key to look for
     * @param tezOp    operator whose physical plan is scanned
     * @return {@code true} if any such input lists {@code inputKey}; {@code false} otherwise
     * @throws PlanException wrapping any {@link VisitorException} raised while scanning the plan
     */
    public static boolean isNonPackageInput(String inputKey, TezOperator tezOp) throws PlanException {
        try {
            List<TezInput> inputs = PlanHelper.getPhysicalOperators(tezOp.plan, TezInput.class);
            for (TezInput input : inputs) {
                if (ArrayUtils.contains(input.getTezInputs(), inputKey)) {
                    return true;
                }
            }
            List<POUserFunc> userFuncs = PlanHelper.getPhysicalOperators(tezOp.plan, POUserFunc.class);
            for (POUserFunc userFunc : userFuncs) {
                if (userFunc.getFunc() instanceof ReadScalarsTez) {
                    TezInput input = (TezInput) userFunc.getFunc();
                    if (ArrayUtils.contains(input.getTezInputs(), inputKey)) {
                        return true;
                    }
                }
            }
            return false;
        } catch (VisitorException e) {
            throw new PlanException(e);
        }
    }

    /**
     * Builds a {@link POForEach} with a single inner plan consisting of {@code project},
     * flattening that plan's result. The foreach's result type is set to BAG.
     *
     * @param project projection that forms the only inner plan
     * @param rp      requested parallelism passed to the {@link POForEach} constructor
     * @param scope   scope used for the foreach's operator key
     * @param nig     generator for the foreach's node id
     * @return the newly created foreach
     */
    public static POForEach getForEach(POProject project, int rp, String scope, NodeIdGenerator nig) {
        PhysicalPlan forEachPlan = new PhysicalPlan();
        forEachPlan.add(project);

        List<PhysicalPlan> forEachPlans = Lists.newArrayList();
        forEachPlans.add(forEachPlan);

        List<Boolean> flatten = Lists.newArrayList();
        flatten.add(true);

        POForEach forEach = new POForEach(new OperatorKey(scope, nig.getNextNodeId(scope)), rp, forEachPlans,
                flatten);
        forEach.setResultType(DataType.BAG);
        return forEach;
    }

    /**
     * Builds a plain foreach equivalent to {@code ForEach X generate flatten($1)}: the
     * projection selects column 1, is not a star projection, and is marked overloaded.
     * Parallelism is passed as -1.
     *
     * @param scope scope used for the keys of the created operators
     * @param nig   generator for the node ids of the created operators
     * @return the newly created foreach
     */
    public static POForEach getForEachPlain(String scope, NodeIdGenerator nig) {
        POProject project = new POProject(new OperatorKey(scope, nig.getNextNodeId(scope)));
        project.setResultType(DataType.TUPLE);
        project.setStar(false);
        project.setColumn(1);
        project.setOverloaded(true);
        return getForEach(project, -1, scope, nig);
    }

    /**
     * Creates a {@link POStoreTez} flagged as a temporary store.
     *
     * @param scope scope used for the store's operator key
     * @param nig   generator for the store's node id
     * @return the newly created store
     */
    public static POStore getStore(String scope, NodeIdGenerator nig) {
        POStore st = new POStoreTez(new OperatorKey(scope, nig.getNextNodeId(scope)));
        // mark store as tmp store. These could be removed by the
        // optimizer, because it wasn't the user requesting it.
        st.setIsTmpStore(true);
        return st;
    }

    /**
     * Sets the partitioner class on every incoming edge of {@code tezOp} to the class
     * resolved from {@code customPartitioner}. Does nothing when {@code customPartitioner}
     * is null.
     *
     * @param customPartitioner class name of the partitioner, or {@code null}
     * @param tezOp             operator whose incoming edges are updated
     * @throws IOException if the class name cannot be resolved
     */
    public static void setCustomPartitioner(String customPartitioner, TezOperator tezOp) throws IOException {
        if (customPartitioner != null) {
            for (TezEdgeDescriptor edge : tezOp.inEdges.values()) {
                edge.partitionerClass = PigContext.resolveClassName(customPartitioner);
            }
        }
    }

    /**
     * Configures {@code edge} for value-only tuple output (used with POValueOutputTez).
     * <ul>
     * <li>BROADCAST and ONE_TO_ONE: {@link UnorderedKVOutput} / {@link UnorderedKVInput}.</li>
     * <li>SCATTER_GATHER: {@link UnorderedPartitionedKVOutput} / {@link UnorderedKVInput}
     * with {@link RoundRobinPartitioner}.</li>
     * <li>Any other movement type: input class, output class and partitioner are left as
     * they were.</li>
     * </ul>
     * In every case the data movement type is recorded, the intermediate key class is set
     * to {@code POValueOutputTez.EmptyWritable} and the value class to {@link #TUPLE_CLASS}.
     *
     * @param edge             edge descriptor to configure
     * @param dataMovementType data movement type to record on the edge
     */
    public static void configureValueOnlyTupleOutput(TezEdgeDescriptor edge, DataMovementType dataMovementType) {
        edge.dataMovementType = dataMovementType;
        if (dataMovementType == DataMovementType.BROADCAST || dataMovementType == DataMovementType.ONE_TO_ONE) {
            edge.outputClassName = UnorderedKVOutput.class.getName();
            edge.inputClassName = UnorderedKVInput.class.getName();
        } else if (dataMovementType == DataMovementType.SCATTER_GATHER) {
            edge.outputClassName = UnorderedPartitionedKVOutput.class.getName();
            edge.inputClassName = UnorderedKVInput.class.getName();
            edge.partitionerClass = RoundRobinPartitioner.class;
        }
        edge.setIntermediateOutputKeyClass(POValueOutputTez.EmptyWritable.class.getName());
        edge.setIntermediateOutputValueClass(TUPLE_CLASS);
    }

    /**
     * Tells whether the foreach that precedes the first leaf of {@code combinePlan}
     * produces a bag from any of its input plans. An input plan counts when its first leaf
     * has result type BAG, or when that leaf is a {@link POUserFunc} whose original return
     * type equals the return type of {@link TOBAG}.
     * <p>
     * The first predecessor of the plan's first leaf is cast to {@link POForEach} without
     * a type check.
     *
     * @param combinePlan combiner plan to inspect
     * @return {@code true} if a bag-producing input plan is found; {@code false} otherwise
     * @throws ExecException declared for callers; not raised explicitly by the code in this method
     */
    public static boolean bagDataTypeInCombinePlan(PhysicalPlan combinePlan) throws ExecException {
        PhysicalOperator lr = combinePlan.getLeaves().get(0);
        POForEach fe = (POForEach) combinePlan.getPredecessors(lr).get(0);

        // Hack. class.getTypeName() is only available in JDK8
        Type dataBagType = new TOBAG().getReturnType();

        List<PhysicalPlan> inputPlans = fe.getInputPlans();
        for (PhysicalPlan inputPlan : inputPlans) {
            PhysicalOperator leaf = inputPlan.getLeaves().get(0);
            if (leaf.getResultType() == DataType.BAG) {
                return true;
            } else if (leaf instanceof POUserFunc) {
                POUserFunc func = (POUserFunc) leaf;
                // Return type of Intermediate func in combiner plan is always Tuple.
                // Need to check original or Final EvalFunc return type
                if (dataBagType.equals(func.getOriginalReturnType())) {
                    return true;
                }
            }
        }
        return false;
    }

    // Stores the same edge descriptor on both endpoints: as an incoming edge of "to"
    // (keyed by from's operator key) and as an outgoing edge of "from" (keyed by to's).
    private static void registerEdge(TezOperator from, TezOperator to, TezEdgeDescriptor edge) {
        to.inEdges.put(from.getOperatorKey(), edge);
        from.outEdges.put(to.getOperatorKey(), edge);
    }

}