org.apache.atlas.storm.hook.StormAtlasHook.java Source code

Introduction

Here is the source code for org.apache.atlas.storm.hook.StormAtlasHook.java, the hook that reports Storm topology metadata to Apache Atlas.
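
StormAtlasHook implements Storm's ISubmitterHook interface, so Storm can invoke it after a topology is submitted. In a typical Atlas deployment the hook is enabled by naming this class in the storm.topology.submission.notifier.plugin.class setting of storm.yaml and by placing the Atlas hook jars and atlas-application.properties on the Storm classpath; see the sketches after the listing for an illustration.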

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.atlas.storm.hook;

import org.apache.storm.ISubmitterHook;
import org.apache.storm.generated.Bolt;
import org.apache.storm.generated.SpoutSpec;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.generated.TopologyInfo;
import org.apache.storm.utils.Utils;
import org.apache.atlas.AtlasClient;
import org.apache.atlas.AtlasConstants;
import org.apache.atlas.hive.bridge.HiveMetaStoreBridge;
import org.apache.atlas.hook.AtlasHook;
import org.apache.atlas.storm.model.StormDataTypes;
import org.apache.atlas.typesystem.Referenceable;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Date;

/**
 * StormAtlasHook sends Storm topology metadata to Atlas
 * via a Kafka broker for durability.
 * <p/>
 * This is based on the assumption that the same topology name is used
 * for the various lifecycle stages.
 */
public class StormAtlasHook extends AtlasHook implements ISubmitterHook {

    public static final Logger LOG = LoggerFactory.getLogger(StormAtlasHook.class);

    private static final String CONF_PREFIX = "atlas.hook.storm.";
    private static final String HOOK_NUM_RETRIES = CONF_PREFIX + "numRetries";
    // Used as the owner if the Storm topology does not carry an owner instance,
    // which is possible when Storm runs in unsecured mode.
    public static final String ANONYMOUS_OWNER = "anonymous";

    public static final String HBASE_NAMESPACE_DEFAULT = "default";

    @Override
    protected String getNumberOfRetriesPropertyKey() {
        return HOOK_NUM_RETRIES;
    }

    /**
     * This is the client-side hook that Storm fires when a topology is added.
     *
     * @param topologyInfo topology info
     * @param stormConf configuration
     * @param stormTopology a storm topology
     * @throws IllegalAccessException
     */
    @Override
    public void notify(TopologyInfo topologyInfo, Map stormConf, StormTopology stormTopology)
            throws IllegalAccessException {

        LOG.info("Collecting metadata for a new storm topology: {}", topologyInfo.get_name());
        try {
            ArrayList<Referenceable> entities = new ArrayList<>();
            Referenceable topologyReferenceable = createTopologyInstance(topologyInfo, stormConf);
            List<Referenceable> dependentEntities = addTopologyDataSets(stormTopology, topologyReferenceable,
                    topologyInfo.get_owner(), stormConf);
            if (dependentEntities.size() > 0) {
                entities.addAll(dependentEntities);
            }
            // create the graph for the topology
            ArrayList<Referenceable> graphNodes = createTopologyGraph(stormTopology, stormTopology.get_spouts(),
                    stormTopology.get_bolts());
            // add the connection from topology to the graph
            topologyReferenceable.set("nodes", graphNodes);
            entities.add(topologyReferenceable);

            LOG.debug("notifying entities, size = {}", entities.size());
            String user = getUser(topologyInfo.get_owner(), null);
            notifyEntities(user, entities);
        } catch (Exception e) {
            throw new RuntimeException("Atlas hook is unable to process the topology.", e);
        }
    }

    private Referenceable createTopologyInstance(TopologyInfo topologyInfo, Map stormConf) throws Exception {
        Referenceable topologyReferenceable = new Referenceable(StormDataTypes.STORM_TOPOLOGY.getName());
        topologyReferenceable.set("id", topologyInfo.get_id());
        topologyReferenceable.set(AtlasClient.NAME, topologyInfo.get_name());
        topologyReferenceable.set(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, topologyInfo.get_name());
        String owner = topologyInfo.get_owner();
        if (StringUtils.isEmpty(owner)) {
            owner = ANONYMOUS_OWNER;
        }
        topologyReferenceable.set(AtlasClient.OWNER, owner);
        topologyReferenceable.set("startTime", new Date(System.currentTimeMillis()));
        topologyReferenceable.set(AtlasConstants.CLUSTER_NAME_ATTRIBUTE, getClusterName(stormConf));

        return topologyReferenceable;
    }

    private List<Referenceable> addTopologyDataSets(StormTopology stormTopology,
            Referenceable topologyReferenceable, String topologyOwner, Map stormConf) throws Exception {
        List<Referenceable> dependentEntities = new ArrayList<>();
        // add each spout as an input data set
        addTopologyInputs(topologyReferenceable, stormTopology.get_spouts(), stormConf, topologyOwner,
                dependentEntities);
        // add the appropriate bolts as output data sets
        addTopologyOutputs(topologyReferenceable, stormTopology, topologyOwner, stormConf, dependentEntities);
        return dependentEntities;
    }

    private void addTopologyInputs(Referenceable topologyReferenceable, Map<String, SpoutSpec> spouts,
            Map stormConf, String topologyOwner, List<Referenceable> dependentEntities)
            throws IllegalAccessException {
        final ArrayList<Referenceable> inputDataSets = new ArrayList<>();
        for (Map.Entry<String, SpoutSpec> entry : spouts.entrySet()) {
            Serializable instance = Utils.javaDeserialize(entry.getValue().get_spout_object().get_serialized_java(),
                    Serializable.class);

            String simpleName = instance.getClass().getSimpleName();
            final Referenceable datasetRef = createDataSet(simpleName, topologyOwner, instance, stormConf,
                    dependentEntities);
            if (datasetRef != null) {
                inputDataSets.add(datasetRef);
            }
        }

        topologyReferenceable.set("inputs", inputDataSets);
    }

    private void addTopologyOutputs(Referenceable topologyReferenceable, StormTopology stormTopology,
            String topologyOwner, Map stormConf, List<Referenceable> dependentEntities) throws Exception {
        final ArrayList<Referenceable> outputDataSets = new ArrayList<>();

        Map<String, Bolt> bolts = stormTopology.get_bolts();
        Set<String> terminalBoltNames = StormTopologyUtil.getTerminalUserBoltNames(stormTopology);
        for (String terminalBoltName : terminalBoltNames) {
            Serializable instance = Utils.javaDeserialize(
                    bolts.get(terminalBoltName).get_bolt_object().get_serialized_java(), Serializable.class);

            String dataSetType = instance.getClass().getSimpleName();
            final Referenceable datasetRef = createDataSet(dataSetType, topologyOwner, instance, stormConf,
                    dependentEntities);
            if (datasetRef != null) {
                outputDataSets.add(datasetRef);
            }
        }

        topologyReferenceable.set("outputs", outputDataSets);
    }

    private Referenceable createDataSet(String name, String topologyOwner, Serializable instance, Map stormConf,
            List<Referenceable> dependentEntities) throws IllegalAccessException {
        Map<String, String> config = StormTopologyUtil.getFieldValues(instance, true, null);

        String clusterName = null;
        Referenceable dataSetReferenceable;
        // todo: need to redo this with a config driven approach
        switch (name) {
        case "KafkaSpout":
            dataSetReferenceable = new Referenceable(StormDataTypes.KAFKA_TOPIC.getName());
            final String topicName = config.get("KafkaSpout._spoutConfig.topic");
            dataSetReferenceable.set("topic", topicName);
            dataSetReferenceable.set("uri", config.get("KafkaSpout._spoutConfig.hosts.brokerZkStr"));
            if (StringUtils.isEmpty(topologyOwner)) {
                topologyOwner = ANONYMOUS_OWNER;
            }
            dataSetReferenceable.set(AtlasClient.OWNER, topologyOwner);
            dataSetReferenceable.set(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME,
                    getKafkaTopicQualifiedName(getClusterName(stormConf), topicName));
            dataSetReferenceable.set(AtlasClient.NAME, topicName);
            break;

        case "HBaseBolt":
            dataSetReferenceable = new Referenceable(StormDataTypes.HBASE_TABLE.getName());
            final String hbaseTableName = config.get("HBaseBolt.tableName");
            dataSetReferenceable.set("uri", stormConf.get("hbase.rootdir"));
            dataSetReferenceable.set(AtlasClient.NAME, hbaseTableName);
            dataSetReferenceable.set(AtlasClient.OWNER, stormConf.get("storm.kerberos.principal"));
            clusterName = extractComponentClusterName(HBaseConfiguration.create(), stormConf);
            //TODO - HBase namespace is hardcoded to 'default'. Need to check how to obtain it, or whether it is already part of tableName
            dataSetReferenceable.set(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME,
                    getHbaseTableQualifiedName(clusterName, HBASE_NAMESPACE_DEFAULT, hbaseTableName));
            break;

        case "HdfsBolt":
            dataSetReferenceable = new Referenceable(HiveMetaStoreBridge.HDFS_PATH);
            String hdfsUri = config.get("HdfsBolt.rotationActions") == null
                    ? config.get("HdfsBolt.fileNameFormat.path")
                    : config.get("HdfsBolt.rotationActions");
            final String hdfsPathStr = config.get("HdfsBolt.fsUrl") + hdfsUri;
            dataSetReferenceable.set(AtlasConstants.CLUSTER_NAME_ATTRIBUTE, getClusterName(stormConf));
            dataSetReferenceable.set(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, hdfsPathStr);
            dataSetReferenceable.set("path", hdfsPathStr);
            dataSetReferenceable.set(AtlasClient.OWNER, stormConf.get("hdfs.kerberos.principal"));
            final Path hdfsPath = new Path(hdfsPathStr);
            dataSetReferenceable.set(AtlasClient.NAME,
                    Path.getPathWithoutSchemeAndAuthority(hdfsPath).toString().toLowerCase());
            break;

        case "HiveBolt":
            // todo: verify if hive table has everything needed to retrieve existing table
            Referenceable dbReferenceable = new Referenceable("hive_db");
            String databaseName = config.get("HiveBolt.options.databaseName");
            dbReferenceable.set(AtlasClient.NAME, databaseName);
            dbReferenceable.set(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME,
                    HiveMetaStoreBridge.getDBQualifiedName(getClusterName(stormConf), databaseName));
            dbReferenceable.set(AtlasConstants.CLUSTER_NAME_ATTRIBUTE, getClusterName(stormConf));
            dependentEntities.add(dbReferenceable);
            clusterName = extractComponentClusterName(new HiveConf(), stormConf);
            final String hiveTableName = config.get("HiveBolt.options.tableName");
            dataSetReferenceable = new Referenceable("hive_table");
            final String tableQualifiedName = HiveMetaStoreBridge.getTableQualifiedName(clusterName, databaseName,
                    hiveTableName);
            dataSetReferenceable.set(AtlasClient.NAME, hiveTableName);
            dataSetReferenceable.set(HiveMetaStoreBridge.DB, dbReferenceable);
            dataSetReferenceable.set(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, tableQualifiedName);
            break;

        default:
            // custom node - create a base dataset class with name attribute
            //TODO - What should we do for custom data sets? Not sure what name we can set here.
            return null;
        }
        dependentEntities.add(dataSetReferenceable);

        return dataSetReferenceable;
    }

    private String extractComponentClusterName(Configuration configuration, Map stormConf) {
        String clusterName = configuration.get(AtlasConstants.CLUSTER_NAME_KEY, null);
        if (clusterName == null) {
            clusterName = getClusterName(stormConf);
        }
        return clusterName;
    }

    private ArrayList<Referenceable> createTopologyGraph(StormTopology stormTopology, Map<String, SpoutSpec> spouts,
            Map<String, Bolt> bolts) throws Exception {
        // Add graph of nodes in the topology
        final Map<String, Referenceable> nodeEntities = new HashMap<>();
        addSpouts(spouts, nodeEntities);
        addBolts(bolts, nodeEntities);

        addGraphConnections(stormTopology, nodeEntities);

        ArrayList<Referenceable> nodes = new ArrayList<>();
        nodes.addAll(nodeEntities.values());
        return nodes;
    }

    private void addSpouts(Map<String, SpoutSpec> spouts, Map<String, Referenceable> nodeEntities)
            throws IllegalAccessException {
        for (Map.Entry<String, SpoutSpec> entry : spouts.entrySet()) {
            final String spoutName = entry.getKey();
            Referenceable spoutReferenceable = createSpoutInstance(spoutName, entry.getValue());
            nodeEntities.put(spoutName, spoutReferenceable);
        }
    }

    private Referenceable createSpoutInstance(String spoutName, SpoutSpec stormSpout)
            throws IllegalAccessException {
        Referenceable spoutReferenceable = new Referenceable(StormDataTypes.STORM_SPOUT.getName());
        spoutReferenceable.set(AtlasClient.NAME, spoutName);

        Serializable instance = Utils.javaDeserialize(stormSpout.get_spout_object().get_serialized_java(),
                Serializable.class);
        spoutReferenceable.set("driverClass", instance.getClass().getName());

        Map<String, String> flatConfigMap = StormTopologyUtil.getFieldValues(instance, true, null);
        spoutReferenceable.set("conf", flatConfigMap);

        return spoutReferenceable;
    }

    private void addBolts(Map<String, Bolt> bolts, Map<String, Referenceable> nodeEntities)
            throws IllegalAccessException {
        for (Map.Entry<String, Bolt> entry : bolts.entrySet()) {
            Referenceable boltInstance = createBoltInstance(entry.getKey(), entry.getValue());
            nodeEntities.put(entry.getKey(), boltInstance);
        }
    }

    private Referenceable createBoltInstance(String boltName, Bolt stormBolt) throws IllegalAccessException {
        Referenceable boltReferenceable = new Referenceable(StormDataTypes.STORM_BOLT.getName());

        boltReferenceable.set(AtlasClient.NAME, boltName);

        Serializable instance = Utils.javaDeserialize(stormBolt.get_bolt_object().get_serialized_java(),
                Serializable.class);
        boltReferenceable.set("driverClass", instance.getClass().getName());

        Map<String, String> flatConfigMap = StormTopologyUtil.getFieldValues(instance, true, null);
        boltReferenceable.set("conf", flatConfigMap);

        return boltReferenceable;
    }

    private void addGraphConnections(StormTopology stormTopology, Map<String, Referenceable> nodeEntities)
            throws Exception {
        // adds connections between spouts and bolts
        Map<String, Set<String>> adjacencyMap = StormTopologyUtil.getAdjacencyMap(stormTopology, true);

        for (Map.Entry<String, Set<String>> entry : adjacencyMap.entrySet()) {
            String nodeName = entry.getKey();
            Set<String> adjacencyList = adjacencyMap.get(nodeName);
            if (adjacencyList == null || adjacencyList.isEmpty()) {
                continue;
            }

            // add outgoing links
            Referenceable node = nodeEntities.get(nodeName);
            ArrayList<String> outputs = new ArrayList<>(adjacencyList.size());
            outputs.addAll(adjacencyList);
            node.set("outputs", outputs);

            // add incoming links
            for (String adjacentNodeName : adjacencyList) {
                Referenceable adjacentNode = nodeEntities.get(adjacentNodeName);
                @SuppressWarnings("unchecked")
                ArrayList<String> inputs = (ArrayList<String>) adjacentNode.get("inputs");
                if (inputs == null) {
                    inputs = new ArrayList<>();
                }
                inputs.add(nodeName);
                adjacentNode.set("inputs", inputs);
            }
        }
    }

    public static String getKafkaTopicQualifiedName(String clusterName, String topicName) {
        return String.format("%s@%s", topicName, clusterName);
    }

    public static String getHbaseTableQualifiedName(String clusterName, String nameSpace, String tableName) {
        return String.format("%s.%s@%s", nameSpace, tableName, clusterName);
    }

    private String getClusterName(Map stormConf) {
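        // Reads the cluster name from Atlas configuration; the stormConf argument is currently unused.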
        return atlasProperties.getString(AtlasConstants.CLUSTER_NAME_KEY, AtlasConstants.DEFAULT_CLUSTER_NAME);
    }
}
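
Example

A minimal sketch of how a configured submitter hook such as StormAtlasHook gets loaded and invoked after a topology is submitted, assuming the hook class name has been read from the storm.topology.submission.notifier.plugin.class setting. The invoker class and method names below are hypothetical and are not part of Storm or Atlas.

import java.util.Map;

import org.apache.storm.ISubmitterHook;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.generated.TopologyInfo;

public class SubmitterHookInvocationSketch {

    // Hypothetical helper mirroring how the configured hook class is instantiated
    // reflectively and handed the submitted topology's info, config and structure.
    public static void fireHook(String hookClassName, TopologyInfo topologyInfo,
                                Map stormConf, StormTopology stormTopology) throws Exception {
        ISubmitterHook hook = (ISubmitterHook) Class.forName(hookClassName)
                .getDeclaredConstructor()
                .newInstance();
        // For StormAtlasHook, notify() builds Referenceable entities for the topology,
        // its spouts/bolts and their data sets, then publishes them to Atlas.
        hook.notify(topologyInfo, stormConf, stormTopology);
    }
}

The qualified-name helpers getKafkaTopicQualifiedName and getHbaseTableQualifiedName are public and static, so they can be exercised directly. The cluster, topic, namespace and table names below are made-up examples.

import org.apache.atlas.storm.hook.StormAtlasHook;

public class QualifiedNameExample {
    public static void main(String[] args) {
        // kafka_topic qualified name has the form "<topic>@<cluster>"
        System.out.println(StormAtlasHook.getKafkaTopicQualifiedName("primary", "page_views"));
        // prints: page_views@primary

        // hbase_table qualified name has the form "<namespace>.<table>@<cluster>"
        System.out.println(StormAtlasHook.getHbaseTableQualifiedName("primary", "default", "web_events"));
        // prints: default.web_events@primary
    }
}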