org.apache.tez.dag.api.DAG.java Source code

Java tutorial

Introduction

Below is the complete source code for the class org.apache.tez.dag.api.DAG.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tez.dag.api;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Stack;

import org.apache.commons.collections4.BidiMap;
import org.apache.commons.collections4.bidimap.DualLinkedHashBidiMap;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.tez.client.CallerContext;
import org.apache.tez.common.JavaOptsChecker;
import org.apache.tez.common.TezUtils;
import org.apache.tez.dag.api.Vertex.VertexExecutionContext;
import org.apache.tez.dag.api.records.DAGProtos;
import org.apache.tez.serviceplugins.api.ServicePluginsDescriptor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.tez.client.TezClientUtils;
import org.apache.tez.common.security.DAGAccessControls;
import org.apache.tez.common.TezCommonUtils;
import org.apache.tez.common.TezYARNUtils;
import org.apache.tez.dag.api.EdgeProperty.DataMovementType;
import org.apache.tez.dag.api.EdgeProperty.DataSourceType;
import org.apache.tez.dag.api.EdgeProperty.SchedulingType;
import org.apache.tez.dag.api.VertexGroup.GroupInfo;
import org.apache.tez.dag.api.records.DAGProtos.ConfigurationProto;
import org.apache.tez.dag.api.records.DAGProtos.DAGPlan;
import org.apache.tez.dag.api.records.DAGProtos.EdgePlan;
import org.apache.tez.dag.api.records.DAGProtos.PlanGroupInputEdgeInfo;
import org.apache.tez.dag.api.records.DAGProtos.PlanKeyValuePair;
import org.apache.tez.dag.api.records.DAGProtos.PlanTaskConfiguration;
import org.apache.tez.dag.api.records.DAGProtos.PlanTaskLocationHint;
import org.apache.tez.dag.api.records.DAGProtos.PlanVertexGroupInfo;
import org.apache.tez.dag.api.records.DAGProtos.PlanVertexType;
import org.apache.tez.dag.api.records.DAGProtos.VertexPlan;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

/**
 * Top level entity that defines the DAG (Directed Acyclic Graph) representing
 * the data flow graph. Consists of a set of Vertices and Edges connecting the
 * vertices. Vertices represent transformations of data and edges represent
 * movement of data between vertices.
 */
@Public
public class DAG {

    private static final Logger LOG = LoggerFactory.getLogger(DAG.class);

    // Vertices of the DAG keyed by vertex name; bidirectional so membership can
    // also be checked by Vertex instance (see addEdge).
    final BidiMap<String, Vertex> vertices = new DualLinkedHashBidiMap<String, Vertex>();
    // All vertex-to-vertex edges added via addEdge(Edge).
    final Set<Edge> edges = Sets.newHashSet();
    // Immutable DAG name, set at creation time.
    final String name;
    // URIs for which credentials must be obtained before the DAG runs.
    final Collection<URI> urisForCredentials = new HashSet<URI>();
    // Credentials needed by the DAG; replaced wholesale by setCredentials().
    Credentials credentials = new Credentials();
    // Vertex groups created via createVertexGroup().
    Set<VertexGroup> vertexGroups = Sets.newHashSet();

    // Group edges added via addEdge(GroupInputEdge).
    Set<GroupInputEdge> groupInputEdges = Sets.newHashSet();

    private DAGAccessControls dagAccessControls;
    // Local files made available to every task of this DAG (see addTaskLocalFiles).
    Map<String, LocalResource> commonTaskLocalFiles = Maps.newHashMap();
    // Deprecated JSON description blob (see setDAGInfo).
    String dagInfo;
    CallerContext callerContext;
    // DAG-scoped configuration overrides applied in the AM (see setConf).
    private Map<String, String> dagConf = new HashMap<String, String>();
    // Default execution context; can be overridden per vertex.
    private VertexExecutionContext defaultExecutionContext;

    // Private: instances are obtained via the static factory create(String).
    private DAG(String name) {
        this.name = name;
    }

    /**
     * Create a DAG with the specified name.
     * @param name the name of the DAG
     * @return a new {@link DAG} instance
     */
    public static DAG create(String name) {
        return new DAG(name);
    }

    /**
     * Set the files etc that must be provided to the tasks of this DAG
     * @param localFiles
     *          files that must be available locally for each task. These files
     *          may be regular files, archives etc. as specified by the value
     *          elements of the map.
     * @return this {@link DAG}
     * @throws NullPointerException if {@code localFiles} is null
     */
    public synchronized DAG addTaskLocalFiles(Map<String, LocalResource> localFiles) {
        Preconditions.checkNotNull(localFiles);
        // Merged into commonTaskLocalFiles; duplicate-name conflict handling is
        // delegated to TezCommonUtils — presumably rejects mismatched duplicates
        // (NOTE(review): confirm against TezCommonUtils.addAdditionalLocalResources).
        TezCommonUtils.addAdditionalLocalResources(localFiles, commonTaskLocalFiles, "DAG " + getName());
        return this;
    }

    /**
     * Register a {@link Vertex} with this DAG. Vertex names must be unique.
     *
     * @param vertex the vertex to add
     * @return this {@link DAG}
     * @throws IllegalStateException if a vertex with the same name already exists
     */
    public synchronized DAG addVertex(Vertex vertex) {
        final String vertexName = vertex.getName();
        if (vertices.containsKey(vertexName)) {
            throw new IllegalStateException("Vertex " + vertexName + " already defined!");
        }
        vertices.put(vertexName, vertex);
        return this;
    }

    /**
     * Look up a vertex by name.
     * @param vertexName name of the vertex
     * @return the {@link Vertex} with that name, or null if none was added
     */
    public synchronized Vertex getVertex(String vertexName) {
        return vertices.get(vertexName);
    }

    /**
     * One of the methods that can be used to provide information about required
     * Credentials when running on a secure cluster. A combination of this and
     * addURIsForCredentials should be used to specify information about all
     * credentials required by a DAG. AM specific credentials are not used when
     * executing a DAG.
     *
     * Set credentials which will be required to run this dag. This method can be
     * used if the client has already obtained some or all of the required
     * credentials.
     *
     * @param credentials Credentials for the DAG
     * @return this {@link DAG}
     */
    public synchronized DAG setCredentials(Credentials credentials) {
        // Replaces any previously set credentials wholesale (no merging).
        this.credentials = credentials;
        return this;
    }

    /**
     * Set description info for this DAG that can be used for visualization purposes.
     * @param dagInfo JSON blob as a serialized string.
     *                Recognized keys by the UI are:
     *                    "context" - The application context in which this DAG is being used.
     *                                For example, this could be set to "Hive" or "Pig" if
     *                                this is being run as part of a Hive or Pig script.
     *                    "description" - General description on what this DAG is going to do.
     *                                In the case of Hive, this could be the SQL query text.
     * @return this {@link DAG}
     * @throws NullPointerException if {@code dagInfo} is null
     * @deprecated NOTE(review): presumably superseded by
     *             {@link #setCallerContext(CallerContext)} — confirm intended replacement.
     */
    @Deprecated
    public synchronized DAG setDAGInfo(String dagInfo) {
        Preconditions.checkNotNull(dagInfo);
        this.dagInfo = dagInfo;
        return this;
    }

    /**
     * Set the Context in which Tez is being called.
     * @param callerContext Caller Context
     * @return this {@link DAG}
     * @throws NullPointerException if {@code callerContext} is null
     */
    public synchronized DAG setCallerContext(CallerContext callerContext) {
        Preconditions.checkNotNull(callerContext);
        this.callerContext = callerContext;
        return this;
    }

    /**
     * Create a group of vertices that share a common output. This can be used to
     * implement unions efficiently.
     *
     * @param name Name of the group; must be unique within the DAG.
     * @param members {@link Vertex} members of the group
     * @return the newly created {@link VertexGroup}
     * @throws IllegalStateException if a group with the same name already exists
     */
    public synchronized VertexGroup createVertexGroup(String name, Vertex... members) {
        VertexGroup group = new VertexGroup(name, members);
        // Set.add reports duplicates, enforcing group-name uniqueness.
        if (!vertexGroups.add(group)) {
            throw new IllegalStateException("VertexGroup " + name + " already defined!");
        }
        return group;
    }

    /**
     * Get the credentials currently set on this DAG.
     * @return the {@link Credentials} for this DAG
     */
    @Private
    public synchronized Credentials getCredentials() {
        return this.credentials;
    }

    /**
     * Set Access controls for the DAG. Which user/groups can view the DAG progress/history and
     * who can modify the DAG i.e. kill the DAG.
     * The owner of the Tez Session and the user submitting the DAG are super-users and have access
     * to all operations on the DAG.
     * @param accessControls Access Controls
     * @return this {@link DAG}
     */
    public synchronized DAG setAccessControls(DAGAccessControls accessControls) {
        this.dagAccessControls = accessControls;
        return this;
    }

    /**
     * Get the access controls set on this DAG, if any.
     * @return the {@link DAGAccessControls}, or null if none were set
     */
    @Private
    public synchronized DAGAccessControls getDagAccessControls() {
        return dagAccessControls;
    }

    /**
     * One of the methods that can be used to provide information about required
     * Credentials when running on a secure cluster. Use together with
     * {@link #setCredentials(Credentials)} to describe all credentials required
     * by a DAG. AM specific credentials are not used when executing a DAG.
     *
     * Specifies URIs for which Credentials need to be obtained so that the job
     * can run. The set is cumulative: repeated calls add to the previously
     * supplied URIs.
     *
     * Currently, credentials can only be fetched for HDFS and other
     * {@link org.apache.hadoop.fs.FileSystem} implementations that support
     * credentials.
     *
     * @param uris a collection of {@link URI}s
     * @return this {@link DAG}
     * @throws NullPointerException if {@code uris} is null
     */
    public synchronized DAG addURIsForCredentials(Collection<URI> uris) {
        // checkNotNull returns its argument, so validation and addAll can be fused.
        urisForCredentials.addAll(Preconditions.checkNotNull(uris, "URIs cannot be null"));
        return this;
    }

    /**
     * Get the URIs for which credentials will be obtained.
     * @return an unmodifiable collection representing the URIs for which credentials
     *         are required.
     */
    @Private
    public synchronized Collection<URI> getURIsForCredentials() {
        return Collections.unmodifiableCollection(urisForCredentials);
    }

    /**
     * Get all vertices currently added to this DAG.
     * @return an unmodifiable view of the DAG's vertices
     */
    @Private
    public synchronized Set<Vertex> getVertices() {
        return Collections.unmodifiableSet(this.vertices.values());
    }

    /**
     * Add an {@link Edge} connecting two vertices already registered in the DAG.
     *
     * @param edge The edge to be added
     * @return this {@link DAG}
     * @throws IllegalArgumentException if either endpoint is unknown or the edge
     *         is a duplicate
     */
    public synchronized DAG addEdge(Edge edge) {
        final Vertex src = edge.getInputVertex();
        final Vertex dst = edge.getOutputVertex();

        // Both endpoints must have been added via addVertex, and the edge itself
        // must not already be present.
        if (!vertices.containsValue(src)) {
            throw new IllegalArgumentException("Input vertex " + src + " doesn't exist!");
        }
        if (!vertices.containsValue(dst)) {
            throw new IllegalArgumentException("Output vertex " + dst + " doesn't exist!");
        }
        if (edges.contains(edge)) {
            throw new IllegalArgumentException("Edge " + edge + " already defined!");
        }

        // Inform both endpoint vertices of their new connection, then record it.
        src.addOutputVertex(dst, edge);
        dst.addInputVertex(src, edge);
        edges.add(edge);
        return this;
    }

    /**
     * Add a {@link GroupInputEdge} to the DAG, connecting a {@link VertexGroup}
     * to a destination vertex. Each group member is also connected to the
     * destination with a regular {@link Edge} carrying the same edge property.
     *
     * @param edge {@link GroupInputEdge}
     * @return this {@link DAG}
     * @throws IllegalArgumentException if the group or destination vertex is
     *         unknown, or the edge is a duplicate
     */
    public synchronized DAG addEdge(GroupInputEdge edge) {
        // Consolidated the original duplicate variables (av/uv) that both held
        // edge.getInputVertexGroup() into a single inputGroup reference.
        VertexGroup inputGroup = edge.getInputVertexGroup();
        Vertex dstVertex = edge.getOutputVertex();

        // Sanity checks
        if (!vertexGroups.contains(inputGroup)) {
            throw new IllegalArgumentException("Input vertex " + inputGroup + " doesn't exist!");
        }
        if (!vertices.containsValue(dstVertex)) {
            throw new IllegalArgumentException("Output vertex " + dstVertex + " doesn't exist!");
        }
        if (groupInputEdges.contains(edge)) {
            throw new IllegalArgumentException("GroupInputEdge " + edge + " already defined!");
        }

        inputGroup.addOutputVertex(dstVertex, edge);
        groupInputEdges.add(edge);

        // add new edge between members of VertexGroup and destVertex of the GroupInputEdge
        List<Edge> newEdges = Lists.newLinkedList();
        for (Vertex member : inputGroup.getMembers()) {
            newEdges.add(Edge.create(member, dstVertex, edge.getEdgeProperty()));
        }
        dstVertex.addGroupInput(inputGroup.getGroupName(), inputGroup.getGroupInfo());

        for (Edge e : newEdges) {
            addEdge(e);
        }

        return this;
    }

    /**
     * Get the DAG name
     * @return DAG name
     */
    public String getName() {
        return this.name;
    }

    /**
     * This is currently used to setup additional configuration parameters which will be available
     * in the DAG configuration used in the AppMaster. This API would be used for properties which
     * are used by the Tez framework while executing the DAG. As an example, the number of attempts
     * for a task.
     *
     * A DAG inherits its base properties from the ApplicationMaster within which it's running. This
     * method allows for these properties to be overridden.
     *
     * Currently, properties which are used by the task runtime, such as the task to AM
     * heartbeat interval, cannot be changed using this method.
     *
     * Note: This API does not add any configuration to runtime components such as InputInitializers,
     * OutputCommitters, Inputs and Outputs.
     *
     * @param property the property name
     * @param value the value for the property
     * @return the current DAG being constructed
     * @throws IllegalArgumentException if the property is not valid at DAG scope
     *         (NOTE(review): exact exception type depends on
     *         TezConfiguration.validateProperty — confirm)
     */
    @InterfaceStability.Unstable
    public DAG setConf(String property, String value) {
        // Rejects properties that are not settable at DAG scope before recording.
        TezConfiguration.validateProperty(property, Scope.DAG);
        dagConf.put(property, value);
        return this;
    }

    /**
     * Set history log level for this DAG. This config overrides the default or one set at the session
     * level.
     *
     * @param historyLogLevel The ATS history log level for this DAG.
     *
     * @return this DAG
     */
    public DAG setHistoryLogLevel(HistoryLogLevel historyLogLevel) {
        // Stored as a plain DAG-scope config property; see setConf.
        return this.setConf(TezConfiguration.TEZ_HISTORY_LOGGING_LOGLEVEL, historyLogLevel.name());
    }

    /**
     * Sets the default execution context for the DAG. This can be overridden at a per Vertex level.
     * See {@link org.apache.tez.dag.api.Vertex#setExecutionContext(VertexExecutionContext)}
     *
     * @param vertexExecutionContext the default execution context for the DAG
     *
     * @return this DAG
     */
    @Public
    @InterfaceStability.Unstable
    public synchronized DAG setExecutionContext(VertexExecutionContext vertexExecutionContext) {
        this.defaultExecutionContext = vertexExecutionContext;
        return this;
    }

    /**
     * Get the default execution context for this DAG, or null if none was set.
     */
    @Private
    VertexExecutionContext getDefaultExecutionContext() {
        return this.defaultExecutionContext;
    }

    /**
     * Get the DAG-scoped configuration overrides set via {@link #setConf}.
     * Note: returns the live internal map, not a copy.
     */
    @Private
    @VisibleForTesting
    public Map<String, String> getDagConf() {
        return dagConf;
    }

    /**
     * Get the common task-local files set via {@link #addTaskLocalFiles}.
     * Note: returns the live internal map, not a copy.
     */
    @Private
    public Map<String, LocalResource> getTaskLocalFiles() {
        return commonTaskLocalFiles;
    }

    /**
     * Infer parallelism across 1-1 edges and validate parallelism consistency.
     *
     * Propagates known parallelism along ONE_TO_ONE edges to vertices whose
     * parallelism is unset (-1), then checks that every 1-1 edge connects
     * vertices of equal parallelism, and finally verifies that any vertex left
     * with -1 parallelism has a legitimate way of determining it at runtime.
     *
     * @throws TezUncheckedException if a 1-1 edge connects vertices with
     *         mismatched, already-set parallelism
     * @throws IllegalStateException if a vertex has -1 parallelism with no way
     *         to set it at runtime
     */
    @Private
    @VisibleForTesting
    void checkAndInferOneToOneParallelism() {
        // infer all 1-1 via dependencies
        // collect all vertices whose parallelism is already known
        Set<Vertex> newKnownTasksVertices = Sets.newHashSet();
        for (Vertex vertex : vertices.values()) {
            if (vertex.getParallelism() > -1) {
                newKnownTasksVertices.add(vertex);
            }
        }

        // walk through all known source 1-1 edges and infer parallelism
        // add newly inferred vertices for consideration as known sources
        // the outer loop will run for every new level of inferring the parallelism
        // however, the entire logic will process each vertex only once
        while (!newKnownTasksVertices.isEmpty()) {
            Set<Vertex> knownTasksVertices = Sets.newHashSet(newKnownTasksVertices);
            newKnownTasksVertices.clear();
            for (Vertex v : knownTasksVertices) {
                for (Edge e : v.getOutputEdges()) {
                    if (e.getEdgeProperty().getDataMovementType() == DataMovementType.ONE_TO_ONE) {
                        Vertex outVertex = e.getOutputVertex();
                        if (outVertex.getParallelism() == -1) {
                            LOG.info("Inferring parallelism for vertex: " + outVertex.getName() + " to be "
                                    + v.getParallelism() + " from 1-1 connection with vertex " + v.getName());
                            outVertex.setParallelism(v.getParallelism());
                            newKnownTasksVertices.add(outVertex);
                        }
                    }
                }
            }
        }

        // check for inconsistency and errors: after inference, both endpoints of
        // a 1-1 edge must have equal parallelism (or the destination still be -1)
        for (Edge e : edges) {
            Vertex inputVertex = e.getInputVertex();
            Vertex outputVertex = e.getOutputVertex();

            if (e.getEdgeProperty().getDataMovementType() == DataMovementType.ONE_TO_ONE) {
                if (inputVertex.getParallelism() != outputVertex.getParallelism()) {
                    // both should be equal or equal to -1.
                    if (outputVertex.getParallelism() != -1) {
                        throw new TezUncheckedException(
                                "1-1 Edge. Destination vertex parallelism must match source vertex. " + "Vertex: "
                                        + inputVertex.getName() + " does not match vertex: "
                                        + outputVertex.getName());
                    }
                }
            }
        }

        // check the vertices with -1 parallelism; only these cases may have -1 parallelism:
        // It is OK not using topological order to check vertices here.
        // 1. has input initializers
        // 2. has a single data source with a pre-determined number of shards
        // 3. 1-1 uninited sources
        // 4. has custom vertex manager
        for (Vertex vertex : vertices.values()) {
            if (vertex.getParallelism() == -1) {
                boolean hasInputInitializer = false;
                if (vertex.getDataSources() != null && !vertex.getDataSources().isEmpty()) {
                    for (DataSourceDescriptor ds : vertex.getDataSources()) {
                        if (ds.getInputInitializerDescriptor() != null) {
                            hasInputInitializer = true;
                            break;
                        }
                    }
                }
                if (hasInputInitializer) {
                    continue;
                } else {
                    // Account for the case where the vertex has a data source with a determined number of
                    // shards e.g. splits calculated on the client and not in the AM
                    // In this case, vertex parallelism is setup later using the data source's numShards
                    // and as a result, an initializer is not needed.
                    if (vertex.getDataSources() != null && vertex.getDataSources().size() == 1
                            && vertex.getDataSources().get(0).getNumberOfShards() > -1) {
                        continue;
                    }
                }

                boolean has1to1UninitedSources = false;
                if (vertex.getInputVertices() != null && !vertex.getInputVertices().isEmpty()) {
                    for (Vertex srcVertex : vertex.getInputVertices()) {
                        if (srcVertex.getParallelism() == -1) {
                            has1to1UninitedSources = true;
                            break;
                        }
                    }
                }
                if (has1to1UninitedSources) {
                    continue;
                }

                if (vertex.getVertexManagerPlugin() != null) {
                    continue;
                }
                throw new IllegalStateException(
                        vertex.getName() + " has -1 tasks but does not have input initializers, "
                                + "1-1 uninited sources or custom vertex manager to set it at runtime");
            }
        }
    }

    // AnnotatedVertex is used by verify(): wraps a Vertex with the bookkeeping
    // state needed by Tarjan's strongly-connected-components algorithm.
    private static class AnnotatedVertex {
        Vertex v;

        int index; //for Tarjan's algorithm; -1 means not yet visited
        int lowlink; //for Tarjan's algorithm
        boolean onstack; //for Tarjan's algorithm; true while on the DFS stack

        private AnnotatedVertex(Vertex v) {
            this.v = v;
            index = -1;
            lowlink = -1;
        }
    }

    // verify()
    //
    // Default rules
    //   Illegal:
    //     - duplicate vertex id
    //     - cycles
    //
    //   Ok:
    //     - orphaned vertex.  Occurs in map-only
    //     - islands.  Occurs if job has unrelated workflows.
    //
    //   Not yet categorized:
    //     - orphaned vertex in DAG of >1 vertex.  Could be unrelated map-only job.
    //     - v1->v2 via two edges.  perhaps some self-join job would use this?
    //
    // "restricted" mode:
    //   In short term, the supported DAGs are limited. Call with restricted=true for these verifications.
    //   Illegal:
    //     - any vertex with more than one input or output edge. (n-ary input, n-ary merge)

    /** Convenience overload: runs verification in restricted mode. */
    @VisibleForTesting
    void verify() throws IllegalStateException {
        verify(true);
    }

    /**
     * Validate the DAG structure and return the vertices in topological order.
     *
     * Checks performed: non-empty DAG, unique vertex names, no Input/Output name
     * collisions with vertex names or with incoming/outgoing vertex names, no
     * cycles (via Tarjan's algorithm), 1-1 parallelism consistency, and — in
     * restricted mode — that every edge's data source type is PERSISTED or
     * EPHEMERAL.
     *
     * @param restricted whether to apply the additional restricted-mode checks
     * @return vertex names in topological order (sinks pushed first)
     * @throws IllegalStateException if any validation fails
     */
    @VisibleForTesting
    Deque<String> verify(boolean restricted) throws IllegalStateException {
        if (vertices.isEmpty()) {
            throw new IllegalStateException("Invalid dag containing 0 vertices");
        }

        // check for valid vertices, duplicate vertex names,
        // and prepare for cycle detection
        Map<String, AnnotatedVertex> vertexMap = new HashMap<String, AnnotatedVertex>();
        Map<Vertex, Set<String>> inboundVertexMap = new HashMap<Vertex, Set<String>>();
        Map<Vertex, Set<String>> outboundVertexMap = new HashMap<Vertex, Set<String>>();
        for (Vertex v : vertices.values()) {
            if (vertexMap.containsKey(v.getName())) {
                throw new IllegalStateException("DAG contains multiple vertices" + " with name: " + v.getName());
            }
            vertexMap.put(v.getName(), new AnnotatedVertex(v));
        }

        Map<Vertex, List<Edge>> edgeMap = new HashMap<Vertex, List<Edge>>();
        for (Edge e : edges) {
            // Construct structure for cycle detection
            Vertex inputVertex = e.getInputVertex();
            Vertex outputVertex = e.getOutputVertex();
            List<Edge> edgeList = edgeMap.get(inputVertex);
            if (edgeList == null) {
                edgeList = new ArrayList<Edge>();
                edgeMap.put(inputVertex, edgeList);
            }
            edgeList.add(e);

            // Construct map for Input name verification
            Set<String> inboundSet = inboundVertexMap.get(outputVertex);
            if (inboundSet == null) {
                inboundSet = new HashSet<String>();
                inboundVertexMap.put(outputVertex, inboundSet);
            }
            inboundSet.add(inputVertex.getName());

            // Construct map for Output name verification
            Set<String> outboundSet = outboundVertexMap.get(inputVertex);
            if (outboundSet == null) {
                outboundSet = new HashSet<String>();
                outboundVertexMap.put(inputVertex, outboundSet);
            }
            outboundSet.add(outputVertex.getName());
        }

        // check input and output names don't collide with vertex names
        for (Vertex vertex : vertices.values()) {
            for (RootInputLeafOutput<InputDescriptor, InputInitializerDescriptor> input : vertex.getInputs()) {
                if (vertexMap.containsKey(input.getName())) {
                    throw new IllegalStateException("Vertex: " + vertex.getName()
                            + " contains an Input with the same name as vertex: " + input.getName());
                }
            }
            for (RootInputLeafOutput<OutputDescriptor, OutputCommitterDescriptor> output : vertex.getOutputs()) {
                if (vertexMap.containsKey(output.getName())) {
                    throw new IllegalStateException("Vertex: " + vertex.getName()
                            + " contains an Output with the same name as vertex: " + output.getName());
                }
            }
        }

        // Check for valid InputNames
        for (Entry<Vertex, Set<String>> entry : inboundVertexMap.entrySet()) {
            Vertex vertex = entry.getKey();
            for (RootInputLeafOutput<InputDescriptor, InputInitializerDescriptor> input : vertex.getInputs()) {
                if (entry.getValue().contains(input.getName())) {
                    throw new IllegalStateException("Vertex: " + vertex.getName()
                            + " contains an incoming vertex and Input with the same name: " + input.getName());
                }
            }
        }

        // Check for valid OutputNames
        for (Entry<Vertex, Set<String>> entry : outboundVertexMap.entrySet()) {
            Vertex vertex = entry.getKey();
            for (RootInputLeafOutput<OutputDescriptor, OutputCommitterDescriptor> output : vertex.getOutputs()) {
                if (entry.getValue().contains(output.getName())) {
                    throw new IllegalStateException("Vertex: " + vertex.getName()
                            + " contains an outgoing vertex and Output with the same name: " + output.getName());
                }
            }
        }

        // Not checking for repeated input names / output names vertex names on the same vertex,
        // since we only allow 1 at the moment.
        // When additional inputs are supported, this can be checked easily (and early)
        // within the addInput / addOutput call itself.

        Deque<String> topologicalVertexStack = detectCycles(edgeMap, vertexMap);

        checkAndInferOneToOneParallelism();

        if (restricted) {
            for (Edge e : edges) {
                DataSourceType dataSourceType = e.getEdgeProperty().getDataSourceType();
                if (dataSourceType != DataSourceType.PERSISTED && dataSourceType != DataSourceType.EPHEMERAL) {
                    throw new IllegalStateException("Unsupported source type on edge. " + e);
                }
            }
        }

        // check for conflicts between dag level local resource and vertex level local resource
        // NOTE(review): this check is not performed here; see verifyLocalResources.

        return topologicalVertexStack;
    }

    /**
     * Verify that vertex-level local resources do not conflict with same-named
     * DAG-level local resources. When the LocalResource descriptors differ, the
     * resources are further compared by size and then by content SHA; any
     * mismatch is a conflict.
     *
     * @param tezConf configuration used to access the filesystem when hashing
     *                resource contents
     * @throws IllegalStateException if a same-named resource differs in size or sha
     * @throws RuntimeException if sha computation fails (URI or I/O error)
     */
    @VisibleForTesting
    void verifyLocalResources(Configuration tezConf) {
        for (Vertex v : vertices.values()) {
            for (Map.Entry<String, LocalResource> localResource : v.getTaskLocalFiles().entrySet()) {
                String resourceName = localResource.getKey();
                LocalResource resource = localResource.getValue();
                if (commonTaskLocalFiles.containsKey(resourceName)
                        && !commonTaskLocalFiles.get(resourceName).equals(resource)) {
                    // Different for some reason. Compare size, and then eventually hash
                    try {

                        LocalResource commonLr = commonTaskLocalFiles.get(resourceName);
                        if (resource.getSize() != commonLr.getSize()) {
                            throw new IllegalStateException("There is conflicting local resource (size mismatch) ("
                                    + resourceName + ") between dag local resource and vertex " + v.getName()
                                    + " local resource. " + "\nResource of dag : "
                                    + commonTaskLocalFiles.get(resourceName) + "\nResource of vertex: " + resource);
                        }

                        Path vertexResourcePath = ConverterUtils.getPathFromYarnURL(resource.getResource());
                        Path commonResourcePath = ConverterUtils.getPathFromYarnURL(commonLr.getResource());

                        byte[] vertexResourceSha = TezClientUtils.getResourceSha(vertexResourcePath.toUri(),
                                tezConf);
                        byte[] commonResourceSha = TezClientUtils.getResourceSha(commonResourcePath.toUri(),
                                tezConf);

                        if (!Arrays.equals(vertexResourceSha, commonResourceSha)) {
                            throw new IllegalStateException("There is conflicting local resource (sha mismatch) ("
                                    + resourceName + ") between dag local resource and vertex " + v.getName()
                                    + " local resource. " + "\nResource of dag : "
                                    + commonTaskLocalFiles.get(resourceName) + "\nResource of vertex: " + resource);
                        }

                    } catch (URISyntaxException | IOException e) {
                        // BUGFIX: preserve the original exception as the cause instead of
                        // silently dropping it (the stack trace of the underlying URI/IO
                        // failure was previously lost).
                        throw new RuntimeException(
                                "Failed while attempting to validate sha for conflicting resources (" + resourceName
                                        + ") between dag local resource and vertex " + v.getName()
                                        + " local resource. " + "\nResource of dag : "
                                        + commonTaskLocalFiles.get(resourceName) + "\nResource of vertex: "
                                        + resource, e);
                    }
                }
            }
        }
    }

    // Adaptation of Tarjan's algorithm for connected components.
    // http://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm
    //
    // Returns vertex names in topological order (each SCC root is pushed after
    // its descendants); throws IllegalStateException on any cycle.
    private Deque<String> detectCycles(Map<Vertex, List<Edge>> edgeMap, Map<String, AnnotatedVertex> vertexMap)
            throws IllegalStateException {
        Deque<String> topologicalVertexStack = new LinkedList<String>();
        // NOTE(review): a boxed Integer is NOT passed by reference in Java —
        // increments inside strongConnect do not propagate back here, so index
        // numbering restarts at 0 for each top-level DFS tree. Cross-tree edges
        // lead only to already-visited vertices with onstack == false and are
        // ignored for lowlink, so cycle detection appears to still be correct;
        // an int[] or AtomicInteger counter would make the intent explicit.
        // TODO confirm and clean up.
        Integer nextIndex = 0;
        Stack<AnnotatedVertex> stack = new Stack<DAG.AnnotatedVertex>();
        for (AnnotatedVertex av : vertexMap.values()) {
            if (av.index == -1) {
                assert stack.empty();
                strongConnect(av, vertexMap, edgeMap, stack, nextIndex, topologicalVertexStack);
            }
        }
        return topologicalVertexStack;
    }

    // part of Tarjan's algorithm for connected components.
    // http://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm
    //
    // Visits av depth-first, assigning index/lowlink; when av is the root of a
    // strongly connected component of size > 1 (or a self-loop), a cycle is
    // reported. Otherwise av's name is pushed onto topologicalVertexStack.
    private void strongConnect(AnnotatedVertex av, Map<String, AnnotatedVertex> vertexMap,
            Map<Vertex, List<Edge>> edgeMap, Stack<AnnotatedVertex> stack, Integer nextIndex,
            Deque<String> topologicalVertexStack) throws IllegalStateException {
        av.index = nextIndex;
        av.lowlink = nextIndex;
        // NOTE(review): this increments a local copy only — the caller's
        // nextIndex is unchanged (Integer is immutable and passed by value).
        // The incremented value does flow down through the recursive call below.
        nextIndex++;
        stack.push(av);
        av.onstack = true;

        List<Edge> edges = edgeMap.get(av.v);
        if (edges != null) {
            for (Edge e : edgeMap.get(av.v)) {
                AnnotatedVertex outVertex = vertexMap.get(e.getOutputVertex().getName());
                if (outVertex.index == -1) {
                    strongConnect(outVertex, vertexMap, edgeMap, stack, nextIndex, topologicalVertexStack);
                    av.lowlink = Math.min(av.lowlink, outVertex.lowlink);
                } else if (outVertex.onstack) {
                    // strongly connected component detected, but we will wait till later so that the full cycle can be displayed.
                    // update lowlink in case outputVertex should be considered the root of this component.
                    av.lowlink = Math.min(av.lowlink, outVertex.index);
                }
            }
        }

        if (av.lowlink == av.index) {
            AnnotatedVertex pop = stack.pop();
            pop.onstack = false;
            if (pop != av) {
                // there was something on the stack other than this "av".
                // this indicates there is a scc/cycle. It comprises all nodes from top of stack to "av"
                StringBuilder message = new StringBuilder();
                message.append(av.v.getName()).append(" <- ");
                for (; pop != av; pop = stack.pop()) {
                    message.append(pop.v.getName()).append(" <- ");
                    pop.onstack = false;
                }
                message.append(av.v.getName());
                throw new IllegalStateException("DAG contains a cycle: " + message);
            } else {
                // detect self-cycle: an SCC of size 1 is only a cycle if the
                // vertex has an edge back to itself.
                if (edgeMap.containsKey(pop.v)) {
                    for (Edge edge : edgeMap.get(pop.v)) {
                        if (edge.getOutputVertex().equals(pop.v)) {
                            throw new IllegalStateException(
                                    "DAG contains a self-cycle on vertex:" + pop.v.getName());
                        }
                    }
                }
            }
            topologicalVertexStack.push(av.v.getName());
        }
    }

    /**
     * Convenience overload that builds the protobuf {@link DAGPlan} message for
     * this DAG without a {@link ServicePluginsDescriptor} and without a
     * {@link JavaOptsChecker} (both supplied as null to the full variant).
     */
    @Private
    public DAGPlan createDag(Configuration tezConf, Credentials extraCredentials,
            Map<String, LocalResource> tezJarResources, LocalResource binaryConfig, boolean tezLrsAsArchive) {
        // Delegate to the full variant with both optional collaborators absent.
        return createDag(tezConf, extraCredentials, tezJarResources, binaryConfig, tezLrsAsArchive,
                null, null);
    }

    /**
     * Builds the protobuf {@link DAGPlan} message describing this DAG, suitable
     * for submission to the Tez AM.
     *
     * Vertices are emitted in topological order (supplied by {@code verify(true)});
     * for each vertex, task resources, parallelism, location hints, local
     * resources and credentials are inferred from its data sources/sinks where
     * they were not set explicitly.
     *
     * @param tezConf configuration consulted for defaults (task memory/vcores,
     *        task launch env, java opts, history logging level)
     * @param extraCredentials additional credentials merged into the DAG's
     *        credentials; may be null
     * @param tezJarResources framework local resources added to every vertex;
     *        may be null
     * @param binaryConfig serialized configuration added to every vertex as the
     *        TEZ_PB_BINARY_CONF_NAME local resource; may be null
     * @param tezLrsAsArchive whether the tez local resources are archives
     *        (forwarded to the default-env setup)
     * @param servicePluginsDescriptor declared service plugins, used to validate
     *        DAG-default and per-vertex execution contexts; may be null
     * @param javaOptsChecker validator applied to task launch command opts;
     *        may be null
     * @return the fully populated {@link DAGPlan} proto
     */
    @Private
    public synchronized DAGPlan createDag(Configuration tezConf, Credentials extraCredentials,
            Map<String, LocalResource> tezJarResources, LocalResource binaryConfig, boolean tezLrsAsArchive,
            ServicePluginsDescriptor servicePluginsDescriptor, JavaOptsChecker javaOptsChecker) {
        // verify(true) validates the DAG and yields the vertex names in an order
        // consumed below; the Preconditions check further down requires it to
        // contain every vertex exactly once.
        Deque<String> topologicalVertexStack = verify(true);
        verifyLocalResources(tezConf);

        DAGPlan.Builder dagBuilder = DAGPlan.newBuilder();
        dagBuilder.setName(this.name);

        if (this.callerContext != null) {
            dagBuilder.setCallerContext(DagTypeConverters.convertCallerContextToProto(callerContext));
        }
        if (this.dagInfo != null && !this.dagInfo.isEmpty()) {
            dagBuilder.setDagInfo(this.dagInfo);
        }

        // Setup default execution context.
        VertexExecutionContext defaultContext = getDefaultExecutionContext();
        verifyExecutionContext(defaultContext, servicePluginsDescriptor, "DAGDefault");
        if (defaultContext != null) {
            DAGProtos.VertexExecutionContextProto contextProto = DagTypeConverters.convertToProto(defaultContext);
            dagBuilder.setDefaultExecutionContext(contextProto);
        }

        // Serialize each vertex group: name, members, outputs and any
        // edge-merged inputs keyed by destination vertex.
        if (!vertexGroups.isEmpty()) {
            for (VertexGroup av : vertexGroups) {
                GroupInfo groupInfo = av.getGroupInfo();
                PlanVertexGroupInfo.Builder groupBuilder = PlanVertexGroupInfo.newBuilder();
                groupBuilder.setGroupName(groupInfo.getGroupName());
                for (Vertex v : groupInfo.getMembers()) {
                    groupBuilder.addGroupMembers(v.getName());
                }
                groupBuilder.addAllOutputs(groupInfo.outputs);
                for (Map.Entry<String, InputDescriptor> entry : groupInfo.edgeMergedInputs.entrySet()) {
                    groupBuilder.addEdgeMergedInputs(
                            PlanGroupInputEdgeInfo.newBuilder().setDestVertexName(entry.getKey())
                                    .setMergedInput(DagTypeConverters.convertToDAGPlan(entry.getValue())));
                }
                dagBuilder.addVertexGroups(groupBuilder);
            }
        }

        // Start from the DAG-level credentials plus any caller-supplied extras;
        // data source/sink credentials are merged in per-vertex below.
        Credentials dagCredentials = new Credentials();
        if (extraCredentials != null) {
            dagCredentials.mergeAll(extraCredentials);
        }
        dagCredentials.mergeAll(credentials);
        if (!commonTaskLocalFiles.isEmpty()) {
            dagBuilder.addAllLocalResource(DagTypeConverters.convertToDAGPlan(commonTaskLocalFiles));
        }

        Preconditions.checkArgument(topologicalVertexStack.size() == vertices.size(),
                "size of topologicalVertexStack is:" + topologicalVertexStack.size() + " while size of vertices is:"
                        + vertices.size() + ", make sure they are the same in order to sort the vertices");
        // Emit vertices in topological order.
        while (!topologicalVertexStack.isEmpty()) {
            Vertex vertex = vertices.get(topologicalVertexStack.pop());
            // infer credentials, resources and parallelism from data source
            Resource vertexTaskResource = vertex.getTaskResource();
            if (vertexTaskResource == null) {
                // Fall back to the configured (or default) task memory/vcores.
                vertexTaskResource = Resource.newInstance(
                        tezConf.getInt(TezConfiguration.TEZ_TASK_RESOURCE_MEMORY_MB,
                                TezConfiguration.TEZ_TASK_RESOURCE_MEMORY_MB_DEFAULT),
                        tezConf.getInt(TezConfiguration.TEZ_TASK_RESOURCE_CPU_VCORES,
                                TezConfiguration.TEZ_TASK_RESOURCE_CPU_VCORES_DEFAULT));
            }
            Map<String, LocalResource> vertexLRs = Maps.newHashMap();
            vertexLRs.putAll(vertex.getTaskLocalFiles());
            List<DataSourceDescriptor> dataSources = vertex.getDataSources();
            // Merge in credentials and local files contributed by data sources.
            for (DataSourceDescriptor dataSource : dataSources) {
                if (dataSource.getCredentials() != null) {
                    dagCredentials.addAll(dataSource.getCredentials());
                }
                if (dataSource.getAdditionalLocalFiles() != null) {
                    TezCommonUtils.addAdditionalLocalResources(dataSource.getAdditionalLocalFiles(), vertexLRs,
                            "Vertex " + vertex.getName());
                }
            }
            if (tezJarResources != null) {
                TezCommonUtils.addAdditionalLocalResources(tezJarResources, vertexLRs,
                        "Vertex " + vertex.getName());
            }
            if (binaryConfig != null) {
                vertexLRs.put(TezConstants.TEZ_PB_BINARY_CONF_NAME, binaryConfig);
            }
            int vertexParallelism = vertex.getParallelism();
            VertexLocationHint vertexLocationHint = vertex.getLocationHint();
            // With exactly one data source, unset parallelism (-1) and location
            // hints may be inferred from it.
            if (dataSources.size() == 1) {
                DataSourceDescriptor dataSource = dataSources.get(0);
                if (vertexParallelism == -1 && dataSource.getNumberOfShards() > -1) {
                    vertexParallelism = dataSource.getNumberOfShards();
                }
                if (vertexLocationHint == null && dataSource.getLocationHint() != null) {
                    vertexLocationHint = dataSource.getLocationHint();
                }
            }
            // Location hints are only valid alongside a known parallelism, and
            // must then match it exactly.
            if (vertexParallelism == -1) {
                Preconditions.checkState(vertexLocationHint == null,
                        "Cannot specify vertex location hint without specifying vertex parallelism. Vertex: "
                                + vertex.getName());
            } else if (vertexLocationHint != null) {
                Preconditions.checkState(vertexParallelism == vertexLocationHint.getTaskLocationHints().size(),
                        "vertex task location hint must equal vertex parallelism. Vertex: " + vertex.getName());
            }
            for (DataSinkDescriptor dataSink : vertex.getDataSinks()) {
                if (dataSink.getCredentials() != null) {
                    dagCredentials.addAll(dataSink.getCredentials());
                }
            }

            VertexPlan.Builder vertexBuilder = VertexPlan.newBuilder();
            vertexBuilder.setName(vertex.getName());
            vertexBuilder.setType(PlanVertexType.NORMAL); // vertex type is implicitly NORMAL until  TEZ-46.
            vertexBuilder
                    .setProcessorDescriptor(DagTypeConverters.convertToDAGPlan(vertex.getProcessorDescriptor()));

            // Vertex ExecutionContext setup
            VertexExecutionContext execContext = vertex.getVertexExecutionContext();
            verifyExecutionContext(execContext, servicePluginsDescriptor, vertex.getName());
            if (execContext != null) {
                DAGProtos.VertexExecutionContextProto contextProto = DagTypeConverters.convertToProto(execContext);
                vertexBuilder.setExecutionContext(contextProto);
            }
            // End of VertexExecutionContext setup.

            if (vertex.getInputs().size() > 0) {
                for (RootInputLeafOutput<InputDescriptor, InputInitializerDescriptor> input : vertex.getInputs()) {
                    vertexBuilder.addInputs(DagTypeConverters.convertToDAGPlan(input));
                }
            }
            if (vertex.getOutputs().size() > 0) {
                for (RootInputLeafOutput<OutputDescriptor, OutputCommitterDescriptor> output : vertex
                        .getOutputs()) {
                    vertexBuilder.addOutputs(DagTypeConverters.convertToDAGPlan(output));
                }
            }

            // Per-vertex configuration overrides, if any.
            if (vertex.getConf() != null && vertex.getConf().size() > 0) {
                ConfigurationProto.Builder confBuilder = ConfigurationProto.newBuilder();
                TezUtils.populateConfProtoFromEntries(vertex.getConf().entrySet(), confBuilder);
                vertexBuilder.setVertexConf(confBuilder);
            }

            //task config
            PlanTaskConfiguration.Builder taskConfigBuilder = PlanTaskConfiguration.newBuilder();
            taskConfigBuilder.setNumTasks(vertexParallelism);
            taskConfigBuilder.setMemoryMb(vertexTaskResource.getMemory());
            taskConfigBuilder.setVirtualCores(vertexTaskResource.getVirtualCores());

            try {
                taskConfigBuilder.setJavaOpts(TezClientUtils
                        .addDefaultsToTaskLaunchCmdOpts(vertex.getTaskLaunchCmdOpts(), tezConf, javaOptsChecker));
            } catch (TezException e) {
                // Surface invalid opts as an unchecked exception naming the vertex.
                throw new TezUncheckedException(
                        "Invalid TaskLaunchCmdOpts defined for Vertex " + vertex.getName() + " : " + e.getMessage(),
                        e);
            }

            taskConfigBuilder.setTaskModule(vertex.getName());
            if (!vertexLRs.isEmpty()) {
                taskConfigBuilder.addAllLocalResource(DagTypeConverters.convertToDAGPlan(vertexLRs));
            }

            // Task environment: vertex-specified entries plus configured defaults.
            Map<String, String> taskEnv = Maps.newHashMap(vertex.getTaskEnvironment());
            TezYARNUtils.setupDefaultEnv(taskEnv, tezConf, TezConfiguration.TEZ_TASK_LAUNCH_ENV,
                    TezConfiguration.TEZ_TASK_LAUNCH_ENV_DEFAULT,
                    TezConfiguration.TEZ_TASK_LAUNCH_CLUSTER_DEFAULT_ENV,
                    TezConfiguration.TEZ_TASK_LAUNCH_CLUSTER_DEFAULT_ENV_DEFAULT, tezLrsAsArchive);
            for (Map.Entry<String, String> entry : taskEnv.entrySet()) {
                PlanKeyValuePair.Builder envSettingBuilder = PlanKeyValuePair.newBuilder();
                envSettingBuilder.setKey(entry.getKey());
                envSettingBuilder.setValue(entry.getValue());
                taskConfigBuilder.addEnvironmentSetting(envSettingBuilder);
            }

            if (vertexLocationHint != null) {
                if (vertexLocationHint.getTaskLocationHints() != null) {
                    for (TaskLocationHint hint : vertexLocationHint.getTaskLocationHints()) {
                        PlanTaskLocationHint.Builder taskLocationHintBuilder = PlanTaskLocationHint.newBuilder();
                        // we can allow this later on if needed
                        if (hint.getAffinitizedTask() != null) {
                            throw new TezUncheckedException(
                                    "Task based affinity may not be specified via the DAG API");
                        }

                        if (hint.getHosts() != null) {
                            taskLocationHintBuilder.addAllHost(hint.getHosts());
                        }
                        if (hint.getRacks() != null) {
                            taskLocationHintBuilder.addAllRack(hint.getRacks());
                        }

                        vertexBuilder.addTaskLocationHint(taskLocationHintBuilder);
                    }
                }
            }

            if (vertex.getVertexManagerPlugin() != null) {
                vertexBuilder.setVertexManagerPlugin(
                        DagTypeConverters.convertToDAGPlan(vertex.getVertexManagerPlugin()));
            }

            // Edges are referenced from vertices by id only; the edge bodies are
            // serialized once, after this loop.
            for (Edge inEdge : vertex.getInputEdges()) {
                vertexBuilder.addInEdgeId(inEdge.getId());
            }

            for (Edge outEdge : vertex.getOutputEdges()) {
                vertexBuilder.addOutEdgeId(outEdge.getId());
            }

            vertexBuilder.setTaskConfig(taskConfigBuilder);
            dagBuilder.addVertex(vertexBuilder);
        }

        // Serialize each edge: endpoints, movement/source/scheduling types and
        // source/destination descriptors.
        for (Edge edge : edges) {
            EdgePlan.Builder edgeBuilder = EdgePlan.newBuilder();
            edgeBuilder.setId(edge.getId());
            edgeBuilder.setInputVertexName(edge.getInputVertex().getName());
            edgeBuilder.setOutputVertexName(edge.getOutputVertex().getName());
            edgeBuilder.setDataMovementType(
                    DagTypeConverters.convertToDAGPlan(edge.getEdgeProperty().getDataMovementType()));
            edgeBuilder.setDataSourceType(
                    DagTypeConverters.convertToDAGPlan(edge.getEdgeProperty().getDataSourceType()));
            edgeBuilder.setSchedulingType(
                    DagTypeConverters.convertToDAGPlan(edge.getEdgeProperty().getSchedulingType()));
            edgeBuilder.setEdgeSource(DagTypeConverters.convertToDAGPlan(edge.getEdgeProperty().getEdgeSource()));
            edgeBuilder.setEdgeDestination(
                    DagTypeConverters.convertToDAGPlan(edge.getEdgeProperty().getEdgeDestination()));
            if (edge.getEdgeProperty().getDataMovementType() == DataMovementType.CUSTOM) {
                if (edge.getEdgeProperty().getEdgeManagerDescriptor() != null) {
                    edgeBuilder.setEdgeManager(
                            DagTypeConverters.convertToDAGPlan(edge.getEdgeProperty().getEdgeManagerDescriptor()));
                } // else the AM will deal with this.
            }
            dagBuilder.addEdge(edgeBuilder);
        }

        if (dagAccessControls != null) {
            dagBuilder.setAclInfo(DagTypeConverters.convertDAGAccessControlsToProto(dagAccessControls));
        }

        ConfigurationProto.Builder confProtoBuilder = ConfigurationProto.newBuilder();
        if (!this.dagConf.isEmpty()) {
            TezUtils.populateConfProtoFromEntries(this.dagConf.entrySet(), confProtoBuilder);
        }
        // Copy historyLogLevel from tezConf into dagConf if its not overridden in dagConf.
        String logLevel = this.dagConf.get(TezConfiguration.TEZ_HISTORY_LOGGING_LOGLEVEL);
        if (logLevel != null) {
            // The config is from dagConf, we have already added it to the proto above, just check if
            // the value is valid.
            if (!HistoryLogLevel.validateLogLevel(logLevel)) {
                throw new IllegalArgumentException("Config: " + TezConfiguration.TEZ_HISTORY_LOGGING_LOGLEVEL
                        + " is set to invalid value: " + logLevel);
            }
        } else {
            // Validate and set value from tezConf.
            logLevel = tezConf.get(TezConfiguration.TEZ_HISTORY_LOGGING_LOGLEVEL);
            if (logLevel != null) {
                if (!HistoryLogLevel.validateLogLevel(logLevel)) {
                    throw new IllegalArgumentException("Config: " + TezConfiguration.TEZ_HISTORY_LOGGING_LOGLEVEL
                            + " is set to invalid value: " + logLevel);
                }
                PlanKeyValuePair.Builder kvp = PlanKeyValuePair.newBuilder();
                kvp.setKey(TezConfiguration.TEZ_HISTORY_LOGGING_LOGLEVEL);
                kvp.setValue(logLevel);
                confProtoBuilder.addConfKeyValues(kvp);
            }
        }
        dagBuilder.setDagConf(confProtoBuilder);

        // dagCredentials is always non-null here (constructed above); the guard
        // is kept as defensive redundancy.
        if (dagCredentials != null) {
            dagBuilder.setCredentialsBinary(DagTypeConverters.convertCredentialsToProto(dagCredentials));
            TezCommonUtils.logCredentials(LOG, dagCredentials, "dag");
        }

        return dagBuilder.build();
    }

    /**
     * Validates that the given execution context only references service plugins
     * that are enabled/declared in the supplied {@link ServicePluginsDescriptor}:
     * container execution, AM (uber) execution, and the named task scheduler,
     * container launcher and task communicator.
     *
     * @param executionContext context to validate; no-op when null
     * @param servicePluginsDescriptor declared service plugins; may be null, in
     *        which case any plugin reference is invalid
     * @param context label used in error messages ("DAGDefault" or a vertex name)
     * @throws IllegalStateException if the context references a disabled or
     *         undeclared plugin
     */
    private void verifyExecutionContext(VertexExecutionContext executionContext,
            ServicePluginsDescriptor servicePluginsDescriptor, String context) {
        if (executionContext == null) {
            return;
        }
        if (executionContext.shouldExecuteInContainers()) {
            if (servicePluginsDescriptor == null || !servicePluginsDescriptor.areContainersEnabled()) {
                throw new IllegalStateException("Invalid configuration. ExecutionContext for " + context
                        + " specifies container execution but this is disabled in the ServicePluginDescriptor");
            }
        }
        if (executionContext.shouldExecuteInAm()) {
            if (servicePluginsDescriptor == null || !servicePluginsDescriptor.isUberEnabled()) {
                throw new IllegalStateException("Invalid configuration. ExecutionContext for " + context
                        + " specifies AM execution but this is disabled in the ServicePluginDescriptor");
            }
        }
        // The three named-plugin checks share identical structure; delegate to a
        // single helper instead of triplicating the lookup-and-throw logic.
        verifyNamedEntityDeclared(executionContext.getTaskSchedulerName(),
                servicePluginsDescriptor == null ? null : servicePluginsDescriptor.getTaskSchedulerDescriptors(),
                "task scheduler", context);
        verifyNamedEntityDeclared(executionContext.getContainerLauncherName(),
                servicePluginsDescriptor == null ? null
                        : servicePluginsDescriptor.getContainerLauncherDescriptors(),
                "container launcher", context);
        verifyNamedEntityDeclared(executionContext.getTaskCommName(),
                servicePluginsDescriptor == null ? null
                        : servicePluginsDescriptor.getTaskCommunicatorDescriptors(),
                "task communicator", context);
    }

    /**
     * Throws if {@code entityName} is non-null but absent from {@code descriptors}.
     * The error message is identical to the previously inlined checks.
     *
     * @param entityName plugin name referenced by the execution context; null means "not specified"
     * @param descriptors declared descriptors of that plugin kind; may be null
     * @param entityKind human-readable plugin kind used in the error message
     * @param context label used in error messages
     */
    private void verifyNamedEntityDeclared(String entityName, NamedEntityDescriptor[] descriptors,
            String entityKind, String context) {
        if (entityName == null) {
            return;
        }
        if (!checkNamedEntityExists(entityName, descriptors)) {
            throw new IllegalStateException("Invalid configuration. ExecutionContext for " + context
                    + " specifies " + entityKind + " as " + entityName
                    + " which is not part of the ServicePluginDescriptor");
        }
    }

    /**
     * Checks whether a descriptor with the given entity name is present.
     *
     * @param expected the entity name to look for
     * @param namedEntities the declared descriptors; null matches nothing
     * @return true if some descriptor's entity name equals {@code expected}
     */
    private boolean checkNamedEntityExists(String expected, NamedEntityDescriptor[] namedEntities) {
        if (namedEntities == null) {
            return false;
        }
        return Arrays.stream(namedEntities)
                .anyMatch(descriptor -> descriptor.getEntityName().equals(expected));
    }

    /**
     * Returns the {@link CallerContext} associated with this DAG, or null if
     * none has been set.
     */
    public synchronized CallerContext getCallerContext() {
        return callerContext;
    }

}