Source code

Java tutorial


Here is the source code for co.cask.cdap.etl.planner.Dag.java


/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.etl.planner;

import co.cask.cdap.etl.proto.Connection;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.SetMultimap;
import com.google.common.collect.Sets;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;

 * A DAG (directed acyclic graph).
public class Dag {
    protected final Set<String> nodes;
    protected final Set<String> sources;
    protected final Set<String> sinks;
    // stage -> outputs of that stage
    protected final SetMultimap<String, String> outgoingConnections;
    // stage -> inputs for that stage
    protected final SetMultimap<String, String> incomingConnections;

    public Dag(Collection<Connection> connections) {
        Preconditions.checkArgument(!connections.isEmpty(), "Cannot create a DAG without any connections");
        this.outgoingConnections = HashMultimap.create();
        this.incomingConnections = HashMultimap.create();
        for (Connection connection : connections) {
            outgoingConnections.put(connection.getFrom(), connection.getTo());
            incomingConnections.put(connection.getTo(), connection.getFrom());
        this.sources = new HashSet<>();
        this.sinks = new HashSet<>();
        this.nodes = new HashSet<>();

    private Dag(SetMultimap<String, String> outgoingConnections, SetMultimap<String, String> incomingConnections) {
        this.outgoingConnections = HashMultimap.create(outgoingConnections);
        this.incomingConnections = HashMultimap.create(incomingConnections);
        this.sources = new HashSet<>();
        this.sinks = new HashSet<>();
        this.nodes = new HashSet<>();

     * Validate the DAG is a valid DAG without cycles, and no islands. This should only be called before any
     * mutating operations like {@link #removeSource()} are called.
     * @throws IllegalStateException if there is a cycle in the graph, or an island in the graph
    void validate() {
        // if there are no sources, we must have a cycle.
        if (sources.isEmpty()) {
            throw new IllegalStateException("DAG does not have any sources. Please remove cycles from the graph.");
        // similarly, if there are no sinks, we must have a cycle
        if (sinks.isEmpty()) {
            throw new IllegalStateException("DAG does not have any sinks. Please remove cycles from the graph.");

        // check for cycles

        // check for sections of the dag that are on an island by themselves

        // source -> [ nodes accessible by the source ]
        Map<String, Set<String>> nodesAccessibleBySources = new HashMap<>();
        for (String source : sources) {
            nodesAccessibleBySources.put(source, accessibleFrom(source));

        // the algorithm is to keep an island and try to keep adding to it until we can't anymore.
        // the island starts off as the nodes accessible by the first source
        // we then loop through all other sources and add them to the island if they can access any node in the island.
        // we stop if we ever loop through all other sources and can't add them to the island,
        // or if the island has grown to include all sources.
        Set<String> islandNodes = new HashSet<>();
        // seed the island with the first source
        Set<String> potentialIslandSources = new HashSet<>(sources);
        String firstSource = potentialIslandSources.iterator().next();

        while (!potentialIslandSources.isEmpty()) {
            Set<String> sourcesAdded = new HashSet<>();
            // for each source not yet a part of the island
            for (String potentialIslandSource : potentialIslandSources) {
                Set<String> accessibleBySource = nodesAccessibleBySources.get(potentialIslandSource);
                // if that source can access the island in any way, add it to the island
                if (!Sets.intersection(islandNodes, accessibleBySource).isEmpty()) {
            // if this is empty, no sources were added to the island. That means the island really is an island.
            if (sourcesAdded.isEmpty()) {
                throw new IllegalStateException(String.format(
                        "Invalid DAG. There is an island made up of stages %s (no other stages connect to them).",

    public Set<String> getNodes() {
        return nodes;

    public Set<String> getSources() {
        return Collections.unmodifiableSet(sources);

    public Set<String> getSinks() {
        return Collections.unmodifiableSet(sinks);

    public Set<String> getNodeOutputs(String node) {
        return Collections.unmodifiableSet(outgoingConnections.get(node));

    public Set<String> getNodeInputs(String node) {
        return Collections.unmodifiableSet(incomingConnections.get(node));

     * Return all stages accessible from the specified stage.
     * @param stage the stage to start at
     * @return all stages accessible from that stage
    public Set<String> accessibleFrom(String stage) {
        return accessibleFrom(stage, ImmutableSet.<String>of());

     * Return all stages accessible from the specified stage, without going past any node in stopNodes.
     * @param stage the stage to start at
     * @param stopNodes set of nodes to stop traversal on
     * @return all stages accessible from that stage
    public Set<String> accessibleFrom(String stage, Set<String> stopNodes) {
        return accessibleFrom(ImmutableSet.of(stage), stopNodes);

     * Return all stages accessible from the specified stage, without going past any node in stopNodes.
     * @param stages the stages to start at
     * @param stopNodes set of nodes to stop traversal on
     * @return all stages accessible from that stage
    public Set<String> accessibleFrom(Set<String> stages, Set<String> stopNodes) {
        Set<String> accessible = new HashSet<>();
        for (String stage : stages) {
            accessible.addAll(traverse(stage, accessible, stopNodes));
        return accessible;

     * Return a subset of this dag starting from the specified stage.
     * This is equivalent to calling {@link #subsetFrom(String, Set)} with an empty set for stop nodes
     * @param stage the stage to start at
     * @return a dag created from the nodes accessible from the specified stage
    public Dag subsetFrom(String stage) {
        return subsetFrom(stage, ImmutableSet.<String>of());

     * Return a subset of this dag starting from the specified stage, without going past any node in stopNodes.
     * This is equivalent to taking the nodes from {@link #accessibleFrom(String, Set)} and building a dag from them.
     * @param stage the stage to start at
     * @param stopNodes set of nodes to stop traversal on
     * @return a dag created from the nodes accessible from the specified stage
    public Dag subsetFrom(String stage, Set<String> stopNodes) {
        return subsetFrom(ImmutableSet.of(stage), stopNodes);

     * Return a subset of this dag starting from the specified stages.
     * This is equivalent to calling {@link #subsetFrom(Set, Set)} with an empty set for stop nodes
     * @param stages the stage to start at
     * @return a dag created from the nodes accessible from the specified stage
    public Dag subsetFrom(Set<String> stages) {
        return subsetFrom(stages, ImmutableSet.<String>of());

     * Return a subset of this dag starting from the specified stage, without going past any node in stopNodes.
     * This is equivalent to taking the nodes from {@link #accessibleFrom(Set, Set)} and building a dag from them.
     * @param stages the stages to start at
     * @param stopNodes set of nodes to stop traversal on
     * @return a dag created from the nodes accessible from the specified stage
    public Dag subsetFrom(Set<String> stages, Set<String> stopNodes) {
        Set<String> nodes = accessibleFrom(stages, stopNodes);
        Set<Connection> connections = new HashSet<>();
        for (String node : nodes) {
            for (String outputNode : outgoingConnections.get(node)) {
                if (nodes.contains(outputNode)) {
                    connections.add(new Connection(node, outputNode));
        return new Dag(connections);

     * Get the dag in topological order.
     * The returned list guarantees that for each item in the list, that item has no path to an
     * item that comes before it in the list. In the process, if a cycle is found, an exception will be thrown.
     * Topological sort means we pop off a source from the dag, re-calculate sources, and continue until there
     * are no more nodes left. Popping will be done on a copy of the dag so that this is not a destructive operation.
     * @return the dag in topological order
     * @throws IllegalStateException if there is a cycle in the dag
    public List<String> getTopologicalOrder() {
        List<String> linearized = new ArrayList<>();

        Dag copy = new Dag(outgoingConnections, incomingConnections);
        String removed;
        while ((removed = copy.removeSource()) != null) {
        if (copy.outgoingConnections.isEmpty()) {
            return linearized;
        // if we've run out of sources to remove, but there are still connections, that means there is a cycle.
        // remove all sinks so we can print out where the cycle is.
        do {
            removed = copy.removeSink();
        } while (removed != null);
        Set<String> cycle = accessibleFrom(copy.outgoingConnections.keySet().iterator().next());
        throw new IllegalStateException(
                String.format("Invalid DAG. Stages %s form a cycle.", Joiner.on(',').join(cycle)));

     * Remove a source from the dag. New sources will be re-calculated after the source is removed.
     * @return the removed source, or null if there were no sources to remove.
    protected String removeSource() {
        if (sources.isEmpty()) {
            return null;
        String source = sources.iterator().next();
        return source;

     * Remove the specified connection. Does not check that the connection actually exists.
     * It is possible to break apart the dag with this call.
    protected void removeConnection(String from, String to) {
        outgoingConnections.remove(from, to);
        incomingConnections.remove(to, from);

     * Add the specified connection. It is possible to create a cycle with this call.
    protected void addConnection(String from, String to) {
        outgoingConnections.put(from, to);
        incomingConnections.put(to, from);

     * Remove a specific node from the dag. Removing a node will remove all connections into the node and all
     * connection coming out of the node. Removing a node will also re-compute the sources and sinks of the dag.
     * @param node the node to remove
    private void removeNode(String node) {
        // for each node this output to: node -> outputNode
        for (String outputNode : outgoingConnections.removeAll(node)) {
            // remove the connection from this node to its output
            incomingConnections.remove(outputNode, node);
            // if the removal of that connection caused the output to become a source, add it as a source
            if (incomingConnections.get(outputNode).isEmpty()) {
        // for each node that output to this node: inputNode -> node
        for (String inputNode : incomingConnections.removeAll(node)) {
            // remove the connection from the input node to this node
            outgoingConnections.remove(inputNode, node);
            // if the removal of that connection caused the input node to become a sink, add it as a sink
            if (outgoingConnections.get(inputNode).isEmpty()) {
        // in case this node we removed a source or a sink (or both).

    private Set<String> traverse(String stage, Set<String> alreadySeen, Set<String> stopNodes) {
        if (!alreadySeen.add(stage)) {
            return alreadySeen;
        Collection<String> outputs = outgoingConnections.get(stage);
        if (outputs.isEmpty()) {
            return alreadySeen;
        for (String output : outputs) {
            if (stopNodes.contains(output)) {
            alreadySeen.addAll(traverse(output, alreadySeen, stopNodes));
        return alreadySeen;

    private String removeSink() {
        if (sinks.isEmpty()) {
            return null;
        String sink = sinks.iterator().next();
        return sink;

    private void init() {
        for (String node : nodes) {
            if (outgoingConnections.get(node).isEmpty()) {
            if (incomingConnections.get(node).isEmpty()) {

    public boolean equals(Object o) {
        if (this == o) {
            return true;
        if (o == null || getClass() != o.getClass()) {
            return false;

        Dag that = (Dag) o;

        return Objects.equals(nodes, that.nodes) && Objects.equals(sources, that.sources)
                && Objects.equals(sinks, that.sinks)
                && Objects.equals(outgoingConnections, that.outgoingConnections)
                && Objects.equals(incomingConnections, that.incomingConnections);

    public int hashCode() {
        return Objects.hash(nodes, sources, sinks, outgoingConnections, incomingConnections);

    public String toString() {
        return "Dag{" + "nodes=" + nodes + ", sources=" + sources + ", sinks=" + sinks + ", outgoingConnections="
                + outgoingConnections + ", incomingConnections=" + incomingConnections + '}';