org.apache.hadoop.hive.ql.hooks.LineageInfo.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.hive.ql.hooks.LineageInfo.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.hooks;

import java.io.Serializable;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;

import org.apache.commons.collections.SetUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.common.StringInternUtils;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.Table;

/**
 * This class contains the lineage information that is passed
 * to the PreExecution hook.
 */
public class LineageInfo implements Serializable {

    /**
     * Serial version id.
     */
    private static final long serialVersionUID = 1L;

    /**
     * Classifies how a target column is derived from the columns it depends on.
     */
    public enum DependencyType {

        /**
         * The column is derived from another table column with no
         * transformations, e.g. T2.c1 = T1.c1.
         */
        SIMPLE,

        /**
         * The column is derived from a UDF, UDAF, UDTF or set operations like
         * union on columns of other tables, e.g. T2.c1 = T1.c1 + T3.c1.
         */
        EXPRESSION,

        /**
         * The column is derived from the output of a user script through a
         * TRANSFORM, MAP or REDUCE syntax, or from the output of a PTF chain
         * execution.
         */
        SCRIPT
    }

    /**
     * Holder for the destination of query output, which is either a whole
     * table or a single partition of a table. Both cases are represented by
     * this one class; {@link #isPartition()} tells them apart.
     */
    public static class DataContainer implements Serializable {

        /** Serial version id. */
        private static final long serialVersionUID = 1L;

        /** The table supplied at construction time. */
        private final Table tab;

        /** The partition, or null when this container stands for a table. */
        private final Partition part;

        /**
         * Creates a container that denotes a table (no partition).
         *
         * @param tab The associated table.
         */
        public DataContainer(Table tab) {
            this(tab, null);
        }

        /**
         * Creates a container that denotes a partition of a table.
         *
         * @param tab The table the partition belongs to.
         * @param part The associated partition.
         */
        public DataContainer(Table tab, Partition part) {
            this.tab = tab;
            this.part = part;
        }

        /**
         * Tells whether this container denotes a partition.
         *
         * @return true if a partition was supplied, false otherwise.
         */
        public boolean isPartition() {
            return part != null;
        }

        /**
         * @return the table held by this container
         */
        public Table getTable() {
            return tab;
        }

        /**
         * @return the partition held by this container, or null if it denotes a table
         */
        public Partition getPartition() {
            return part;
        }

        /**
         * Renders "db.table" for a table container and "db.table@values" for
         * a partition container.
         */
        @Override
        public String toString() {
            if (!isPartition()) {
                return tab.getDbName() + "." + tab.getTableName();
            }
            return part.getDbName() + "." + part.getTableName() + "@" + part.getValues();
        }
    }

    /**
     * Lookup key of the dependency index: the (DataContainer, FieldSchema)
     * pair that a Dependency is registered under.
     */
    public static class DependencyKey implements Serializable {

        /** Serial version id. */
        private static final long serialVersionUID = 1L;

        /** The data container part of the key. */
        private final DataContainer dc;

        /** The field schema part of the key. */
        private final FieldSchema fld;

        /**
         * Constructor.
         *
         * @param dc The associated data container.
         * @param fld The associated field schema.
         */
        public DependencyKey(DataContainer dc, FieldSchema fld) {
            this.dc = dc;
            this.fld = fld;
        }

        /**
         * @return the data container of this key
         */
        public DataContainer getDataContainer() {
            return dc;
        }

        /**
         * @return the field schema of this key
         */
        public FieldSchema getFieldSchema() {
            return fld;
        }

        /**
         * Combines the hash codes of both components (0 for a null component)
         * using the usual 31-multiplier scheme.
         */
        @Override
        public int hashCode() {
            int containerHash = (dc == null) ? 0 : dc.hashCode();
            int fieldHash = (fld == null) ? 0 : fld.hashCode();
            return 31 * (31 + containerHash) + fieldHash;
        }

        /**
         * Two keys are equal only when they are of the same class and hold the
         * very same container and field schema instances; the components are
         * compared by reference, not with equals().
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            DependencyKey that = (DependencyKey) obj;
            // Identity comparison on purpose: keeps the original semantics.
            return dc == that.dc && fld == that.fld;
        }

        /**
         * Renders the key as "container:field".
         */
        @Override
        public String toString() {
            return new StringBuilder().append(dc).append(":").append(fld).toString();
        }
    }

    /**
     * Describes one base column: the table (with its alias) and the column
     * within that table.
     */
    public static class BaseColumnInfo implements Serializable {

        /** Serial version id. */
        private static final long serialVersionUID = 1L;

        /** Table and alias information, kept in its own class. */
        private TableAliasInfo tabAlias;

        /**
         * The metastore column information. May be null, which denotes a
         * dependency on the rows of the table rather than on a particular
         * column, as happens for count(1).
         */
        private FieldSchema column;

        /**
         * @return the tabAlias
         */
        public TableAliasInfo getTabAlias() {
            return this.tabAlias;
        }

        /**
         * @param tabAlias the tabAlias to set
         */
        public void setTabAlias(TableAliasInfo tabAlias) {
            this.tabAlias = tabAlias;
        }

        /**
         * @return the column
         */
        public FieldSchema getColumn() {
            return this.column;
        }

        /**
         * @param column the column to set
         */
        public void setColumn(FieldSchema column) {
            this.column = column;
        }

        /**
         * Renders the column as "tabAlias:column".
         */
        @Override
        public String toString() {
            return new StringBuilder().append(tabAlias).append(":").append(column).toString();
        }

        /**
         * Sum of the column hash (7 when the column is null) and the table
         * alias hash (11 when the table alias is null).
         */
        @Override
        public int hashCode() {
            int columnHash = (column == null) ? 7 : column.hashCode();
            int aliasHash = (tabAlias == null) ? 11 : tabAlias.hashCode();
            return columnHash + aliasHash;
        }

        /**
         * Equal to any other BaseColumnInfo whose column and table alias are
         * both equal to this one's; two nulls count as equal.
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof BaseColumnInfo)) {
                return false;
            }
            BaseColumnInfo that = (BaseColumnInfo) obj;
            boolean sameColumn = (column == null) ? that.column == null : column.equals(that.column);
            if (!sameColumn) {
                return false;
            }
            if (tabAlias == null) {
                return that.tabAlias == null;
            }
            return tabAlias.equals(that.tabAlias);
        }
    }

    /**
     * Pairs a metastore table with the alias it is referred to by.
     * Both fields start out null and are populated through the setters.
     */
    public static class TableAliasInfo implements Serializable {

        /**
         * Serial version id.
         */
        private static final long serialVersionUID = 1L;

        /**
         * The alias for the table.
         */
        private String alias;

        /**
         * The metastore table information.
         */
        private Table table;

        /**
         * @return the alias
         */
        public String getAlias() {
            return alias;
        }

        /**
         * @param alias the alias to set
         */
        public void setAlias(String alias) {
            this.alias = alias;
        }

        /**
         * @return the table
         */
        public Table getTable() {
            return table;
        }

        /**
         * @param table the table to set
         */
        public void setTable(Table table) {
            this.table = table;
        }

        /**
         * Renders this object as "db.table(alias)". When no table has been
         * set, the qualified name is rendered as "null" instead of throwing.
         *
         * @return a printable description of the table and its alias
         */
        @Override
        public String toString() {
            // equals() and hashCode() accept an unset table, so printing one
            // must not fail with a NullPointerException either.
            String qualifiedName = (table == null)
                    ? "null"
                    : table.getDbName() + "." + table.getTableName();
            return qualifiedName + "(" + alias + ")";
        }

        /**
         * Sum of the alias hash (7 when the alias is null) and the table hash
         * (11 when the table is null).
         */
        @Override
        public int hashCode() {
            return (alias != null ? alias.hashCode() : 7) + (table != null ? table.hashCode() : 11);
        }

        /**
         * Equal to any other TableAliasInfo with an equal alias and an equal
         * table; two nulls count as equal.
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof TableAliasInfo)) {
                return false;
            }
            TableAliasInfo tabAlias = (TableAliasInfo) obj;
            return StringUtils.equals(alias, tabAlias.alias)
                    && (table == null ? tabAlias.table == null : table.equals(tabAlias.table));
        }
    }

    /**
     * Dependency record for a column: the kind of dependency, the expression
     * text, and the set of base columns it depends on.
     */
    public static class Dependency implements Serializable {

        /** Serial version id. */
        private static final long serialVersionUID = 1L;

        /** The type of dependency. */
        private DependencyType type;

        /** Expression string for the dependency. */
        private String expr;

        /** The set of base columns that the particular column depends on. */
        private Set<BaseColumnInfo> baseCols;

        /**
         * @return the type
         */
        public DependencyType getType() {
            return this.type;
        }

        /**
         * @param type the type to set
         */
        public void setType(DependencyType type) {
            this.type = type;
        }

        /**
         * @return the expr
         */
        public String getExpr() {
            return this.expr;
        }

        /**
         * Stores the expression after passing it through
         * StringInternUtils.internIfNotNull.
         *
         * @param expr the expr to set
         */
        public void setExpr(String expr) {
            String interned = StringInternUtils.internIfNotNull(expr);
            this.expr = interned;
        }

        /**
         * @return the baseCols
         */
        public Set<BaseColumnInfo> getBaseCols() {
            return this.baseCols;
        }

        /**
         * @param baseCols the baseCols to set
         */
        public void setBaseCols(Set<BaseColumnInfo> baseCols) {
            this.baseCols = baseCols;
        }

        /**
         * Renders the dependency as "[type]" followed by the base columns.
         */
        @Override
        public String toString() {
            return new StringBuilder().append("[").append(type).append("]").append(baseCols).toString();
        }
    }

    /**
     * Predicate information for an operator: the predicate expression text
     * and the base columns it depends on.
     */
    public static class Predicate {

        /** Expression string for the predicate. */
        private String expr;

        /**
         * The set of base columns that the predicate depends on. Starts out
         * as an empty insertion-ordered set.
         */
        private Set<BaseColumnInfo> baseCols = new LinkedHashSet<BaseColumnInfo>();

        /**
         * @return the expr
         */
        public String getExpr() {
            return this.expr;
        }

        /**
         * @param expr the expr to set
         */
        public void setExpr(String expr) {
            this.expr = expr;
        }

        /**
         * @return the baseCols
         */
        public Set<BaseColumnInfo> getBaseCols() {
            return this.baseCols;
        }

        /**
         * Sum of the base column set hash and the expression hash (11 when
         * the expression is null).
         */
        @Override
        public int hashCode() {
            int exprHash = (expr == null) ? 11 : expr.hashCode();
            return baseCols.hashCode() + exprHash;
        }

        /**
         * Equal to any other Predicate with the same expression string and an
         * equal set of base columns.
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof Predicate)) {
                return false;
            }
            Predicate that = (Predicate) obj;
            if (!StringUtils.equals(that.expr, expr)) {
                return false;
            }
            return SetUtils.isEqualSet(that.baseCols, baseCols);
        }
    }

    /**
     * Index from the (datacontainer, column) key to the dependency recorded
     * for that key. It is used to generate the dependency vectors during the
     * walk of the operator tree.
     */
    protected Map<DependencyKey, Dependency> index;

    /**
     * Creates an empty lineage index backed by an insertion-ordered map
     * wrapped with Collections.synchronizedMap.
     */
    public LineageInfo() {
        Map<DependencyKey, Dependency> ordered = new LinkedHashMap<DependencyKey, Dependency>();
        this.index = Collections.synchronizedMap(ordered);
    }

    /**
     * Looks up the dependency registered for a table, column tuple.
     *
     * @param dc The data container of the column whose dependency is being inspected.
     * @param col The column whose dependency is being inspected.
     * @return Dependency for that particular table, column tuple,
     *         or null if no dependency is found.
     */
    public Dependency getDependency(DataContainer dc, FieldSchema col) {
        DependencyKey key = new DependencyKey(dc, col);
        return this.index.get(key);
    }

    /**
     * Registers the dependency for a table, column tuple, replacing any
     * dependency previously stored under an equal key.
     *
     * @param dc The datacontainer whose dependency is being inserted.
     * @param col The column whose dependency is being inserted.
     * @param dep The dependency.
     */
    public void putDependency(DataContainer dc, FieldSchema col, Dependency dep) {
        DependencyKey key = new DependencyKey(dc, col);
        this.index.put(key, dep);
    }

    /**
     * Exposes the entry set of the underlying index map.
     * NOTE(review): the JDK requires manual synchronization on a
     * synchronizedMap while iterating its views; confirm callers do not
     * iterate concurrently with writers.
     *
     * @return LineageInfo entry set
     */
    public Set<Map.Entry<DependencyKey, Dependency>> entrySet() {
        return this.index.entrySet();
    }

    /**
     * Removes every entry from the index.
     */
    public void clear() {
        this.index.clear();
    }
}