co.cask.cdap.etl.planner.PipelinePlanner.java Source code

Introduction

Here is the source code for co.cask.cdap.etl.planner.PipelinePlanner.java, which turns a logical PipelineSpec into an execution plan made up of one or more pipeline phases.

Source

/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.etl.planner;

import co.cask.cdap.etl.common.Constants;
import co.cask.cdap.etl.common.PipelinePhase;
import co.cask.cdap.etl.proto.Connection;
import co.cask.cdap.etl.spec.PipelineSpec;
import co.cask.cdap.etl.spec.StageSpec;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

/**
 * Takes a {@link PipelineSpec} and creates an execution plan from it.
 */
public class PipelinePlanner {
    private final Set<String> reduceTypes;
    private final Set<String> isolationTypes;
    private final Set<String> supportedPluginTypes;

    public PipelinePlanner(Set<String> supportedPluginTypes, Set<String> reduceTypes, Set<String> isolationTypes) {
        this.reduceTypes = ImmutableSet.copyOf(reduceTypes);
        this.isolationTypes = ImmutableSet.copyOf(isolationTypes);
        this.supportedPluginTypes = ImmutableSet.copyOf(supportedPluginTypes);
    }

    /**
     * Create an execution plan for the given logical pipeline. This is used for batch pipelines,
     * though it may eventually be useful for marking windowing points in realtime pipelines.
     *
     * A plan consists of one or more phases, with connections between phases.
     * A connection between two phases indicates control flow, not necessarily
     * data flow. This class assumes that it receives a valid pipeline spec.
     * That is, the pipeline has no cycles, all its nodes have unique names,
     * sources don't have any input, sinks don't have any output,
     * everything else has both an input and an output, etc.
     *
     * We start by inserting connector nodes into the logical dag,
     * which are used to mark boundaries between mapreduce jobs.
     * Each connector represents a node where we will need to write to a local dataset.
     *
     * Next, the logical pipeline is broken up into phases,
     * using each connector as a sink in one phase and as a source in another.
     * After this point, connections between phases do not indicate data flow, but control flow.
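     *
     * For example, a logical pipeline source -> aggregator -> sink gets a
     * connector inserted in front of the aggregator (a reduce node) and is
     * then split into two phases: source -> connector, and
     * connector -> aggregator -> sink.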
     *
     * @param spec the pipeline spec, representing a logical pipeline
     * @return the execution plan
     */
    public PipelinePlan plan(PipelineSpec spec) {
        // go through the stages and examine their plugin types to determine which are reduce or isolation stages
        Set<String> reduceNodes = new HashSet<>();
        Set<String> isolationNodes = new HashSet<>();
        Map<String, StageSpec> specs = new HashMap<>();
        for (StageSpec stage : spec.getStages()) {
            if (reduceTypes.contains(stage.getPlugin().getType())) {
                reduceNodes.add(stage.getName());
            }
            if (isolationTypes.contains(stage.getPlugin().getType())) {
                isolationNodes.add(stage.getName());
            }
            specs.put(stage.getName(), stage);
        }

        // insert connector stages into the logical pipeline
        ConnectorDag cdag = ConnectorDag.builder().addConnections(spec.getConnections()).addReduceNodes(reduceNodes)
                .addIsolationNodes(isolationNodes).build();
        cdag.insertConnectors();
        Set<String> connectorNodes = cdag.getConnectors();

        // now split the logical pipeline into pipeline phases, using the connectors as split points
        Map<String, Dag> subdags = new HashMap<>();
        // assign a deterministic name to each subdag
        for (Dag subdag : cdag.splitOnConnectors()) {
            String name = getPhaseName(subdag.getSources(), subdag.getSinks());
            subdags.put(name, subdag);
        }

        // build connections between phases
        Set<Connection> phaseConnections = new HashSet<>();
        for (Map.Entry<String, Dag> subdagEntry1 : subdags.entrySet()) {
            String dag1Name = subdagEntry1.getKey();
            Dag dag1 = subdagEntry1.getValue();

            for (Map.Entry<String, Dag> subdagEntry2 : subdags.entrySet()) {
                String dag2Name = subdagEntry2.getKey();
                Dag dag2 = subdagEntry2.getValue();
                if (dag1Name.equals(dag2Name)) {
                    continue;
                }

                // if dag1 has any sinks that are a source in dag2, add a connection between the dags
                if (!Sets.intersection(dag1.getSinks(), dag2.getSources()).isEmpty()) {
                    phaseConnections.add(new Connection(dag1Name, dag2Name));
                }
            }
        }

        // convert to objects the programs expect.
        Map<String, PipelinePhase> phases = new HashMap<>();
        for (Map.Entry<String, Dag> dagEntry : subdags.entrySet()) {
            phases.put(dagEntry.getKey(), dagToPipeline(dagEntry.getValue(), connectorNodes, specs));
        }
        return new PipelinePlan(phases, phaseConnections);
    }

    /**
     * Converts a Dag into a PipelinePhase, using what we know about the plugin type of each node in the dag.
     * The PipelinePhase is what programs will take as input, and keeps track of sources, transforms, sinks, etc.
     *
     * @param dag the dag to convert
     * @param connectors connector nodes across all dags
     * @param specs specifications for every stage
     * @return the PipelinePhase corresponding to the dag
     */
    private PipelinePhase dagToPipeline(Dag dag, Set<String> connectors, Map<String, StageSpec> specs) {
        PipelinePhase.Builder phaseBuilder = PipelinePhase.builder(supportedPluginTypes);

        for (String stageName : dag.getTopologicalOrder()) {
            Set<String> outputs = dag.getNodeOutputs(stageName);
            if (!outputs.isEmpty()) {
                phaseBuilder.addConnections(stageName, outputs);
            }

            // add connectors
            if (connectors.contains(stageName)) {
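                // connector stages are synthetic and have no error dataset, hence the null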
                phaseBuilder.addStage(Constants.CONNECTOR_TYPE, new StageInfo(stageName, null));
                continue;
            }

            // add other plugin types
            StageSpec spec = specs.get(stageName);
            String pluginType = spec.getPlugin().getType();
            StageInfo stageInfo = new StageInfo(stageName, spec.getErrorDatasetName());
            phaseBuilder.addStage(pluginType, stageInfo);
        }

        return phaseBuilder.build();
    }

    @VisibleForTesting
    static String getPhaseName(Set<String> sources, Set<String> sinks) {
        // using sorted sets to guarantee the name is deterministic
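        // e.g. sources {"s2", "s1"} and sinks {"k"} produce "s1.s2.to.k"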
        return Joiner.on('.').join(new TreeSet<>(sources)) + ".to." + Joiner.on('.').join(new TreeSet<>(sinks));
    }
}
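
Example usage

The following is a minimal, hypothetical sketch of how this planner might be driven. The demo class, the plugin type names passed to the constructor, and its placement in the co.cask.cdap.etl.planner package (required to reach the package-private getPhaseName helper) are illustrative assumptions, not part of CDAP itself.

package co.cask.cdap.etl.planner;

import com.google.common.collect.ImmutableSet;

/**
 * Hypothetical demo, not part of CDAP. It lives in the planner package so it
 * can call the package-private, @VisibleForTesting getPhaseName helper.
 */
public class PipelinePlannerDemo {
    public static void main(String[] args) {
        // The three type sets are assumptions for illustration; real values
        // come from the execution engine's plugin registry.
        PipelinePlanner planner = new PipelinePlanner(
                ImmutableSet.of("batchsource", "transform", "batchsink"), // supported plugin types
                ImmutableSet.of("groupby"),                               // types that force a reduce boundary
                ImmutableSet.of("externalsink"));                         // types that need isolation

        // planner.plan(spec) would now produce a PipelinePlan from a
        // PipelineSpec built elsewhere (omitted here, since building a spec
        // requires the rest of the ETL spec APIs).

        // Phase names are deterministic: sources and sinks are sorted and
        // joined as "<sources>.to.<sinks>".
        String name = PipelinePlanner.getPhaseName(
                ImmutableSet.of("s2", "s1"), ImmutableSet.of("sink"));
        System.out.println(name); // prints "s1.s2.to.sink"
    }
}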