/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package gobblin.data.management.conversion.hive.query;

import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import lombok.ToString;
import lombok.extern.slf4j.Slf4j;

import org.apache.avro.AvroRuntimeException;
import org.apache.avro.Schema;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;

import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;

import gobblin.configuration.State;
import gobblin.data.management.conversion.hive.entities.QueryBasedHivePublishEntity;


/***
 * Generates Hive DDL and DML queries for converting Avro tables to another storage format (default: ORC).
 */
@Slf4j
public class HiveAvroORCQueryGenerator {

  private static final String SERIALIZED_PUBLISH_TABLE_COMMANDS = "serialized.publish.table.commands";
  private static final Gson GSON = new GsonBuilder().setPrettyPrinting().create();

  // Table properties keys
  public static final String ORC_COMPRESSION_KEY = "orc.compress";
  public static final String ORC_ROW_INDEX_STRIDE_KEY = "orc.row.index.stride";

  // Default values for Hive DDL / DML query generation
  private static final String DEFAULT_DB_NAME = "default";
  private static final String DEFAULT_ROW_FORMAT_SERDE = "org.apache.hadoop.hive.ql.io.orc.OrcSerde";
  private static final String DEFAULT_ORC_INPUT_FORMAT = "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat";
  private static final String DEFAULT_ORC_OUTPUT_FORMAT = "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat";
  private static final String DEFAULT_ORC_COMPRESSION = "ZLIB";
  private static final String DEFAULT_ORC_ROW_INDEX_STRIDE = "268435456";
  private static final Properties DEFAULT_TBL_PROPERTIES = new Properties();

  static {
    DEFAULT_TBL_PROPERTIES.setProperty(ORC_COMPRESSION_KEY, DEFAULT_ORC_COMPRESSION);
    DEFAULT_TBL_PROPERTIES.setProperty(ORC_ROW_INDEX_STRIDE_KEY, DEFAULT_ORC_ROW_INDEX_STRIDE);
  }
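  // Illustrative note (not in the original source): with these defaults, and no caller-supplied
  // table properties, a generated CREATE TABLE statement ends with a TBLPROPERTIES clause along
  // the lines of:
  //
  //   TBLPROPERTIES (
  //    'orc.compress'='ZLIB',
  //    'orc.row.index.stride'='268435456')
  //
  // Caller-supplied properties are merged over these defaults in getTableProperties(...) below.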
  // Avro to Hive schema mapping
  private static final Map<Schema.Type, String> AVRO_TO_HIVE_COLUMN_MAPPING_V_12 =
      ImmutableMap.<Schema.Type, String>builder()
          .put(Schema.Type.NULL, "void")
          .put(Schema.Type.BOOLEAN, "boolean")
          .put(Schema.Type.INT, "int")
          .put(Schema.Type.LONG, "bigint")
          .put(Schema.Type.FLOAT, "float")
          .put(Schema.Type.DOUBLE, "double")
          .put(Schema.Type.BYTES, "binary")
          .put(Schema.Type.STRING, "string")
          .put(Schema.Type.RECORD, "struct")
          .put(Schema.Type.MAP, "map")
          .put(Schema.Type.ARRAY, "array")
          .put(Schema.Type.UNION, "uniontype")
          .put(Schema.Type.ENUM, "string")
          .put(Schema.Type.FIXED, "binary")
          .build();

  // Hive evolution types supported (source type -> set of types it may evolve to)
  private static final Map<String, Set<String>> HIVE_COMPATIBLE_TYPES = ImmutableMap.<String, Set<String>>builder()
      .put("tinyint",
          ImmutableSet.<String>builder()
              .add("smallint", "int", "bigint", "float", "double", "decimal", "string", "varchar").build())
      .put("smallint",
          ImmutableSet.<String>builder()
              .add("int", "bigint", "float", "double", "decimal", "string", "varchar").build())
      .put("int",
          ImmutableSet.<String>builder()
              .add("bigint", "float", "double", "decimal", "string", "varchar").build())
      .put("bigint", ImmutableSet.<String>builder().add("float", "double", "decimal", "string", "varchar").build())
      .put("float", ImmutableSet.<String>builder().add("double", "decimal", "string", "varchar").build())
      .put("double", ImmutableSet.<String>builder().add("decimal", "string", "varchar").build())
      .put("decimal", ImmutableSet.<String>builder().add("string", "varchar").build())
      .put("string", ImmutableSet.<String>builder().add("double", "decimal", "varchar").build())
      .put("varchar", ImmutableSet.<String>builder().add("double", "string", "varchar").build())
      .put("timestamp", ImmutableSet.<String>builder().add("string", "varchar").build())
      .put("date", ImmutableSet.<String>builder().add("string", "varchar").build())
      .put("binary", Sets.<String>newHashSet())
      .put("boolean", Sets.<String>newHashSet())
      .build();

  @ToString
  public static enum COLUMN_SORT_ORDER {
    ASC("ASC"),
    DESC("DESC");

    private final String order;

    COLUMN_SORT_ORDER(String s) {
      order = s;
    }
  }
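  // Illustrative note (not in the original source): under this mapping, a hypothetical Avro record
  //
  //   {"type": "record", "name": "user", "fields": [
  //     {"name": "id", "type": "long"},
  //     {"name": "name", "type": ["null", "string"]}]}
  //
  // becomes the Hive column list `id` bigint, `name` string. The nullable ["null", "string"] union
  // collapses to its non-null branch rather than a uniontype (see isOfOptionType(...) below).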
  /***
   * Generate a DDL query to create a Hive table in a different format (default: ORC) for a given Avro schema.
   * @param schema Avro schema to use to generate the DDL for the new Hive table
   * @param tblName New Hive table name
   * @param tblLocation New Hive table location
   * @param optionalDbName Optional DB name; if not specified, it defaults to 'default'
   * @param optionalPartitionDDLInfo Optional partition info in the form of a map of partition key, partition type pairs.
   *                                 If not specified, the table is assumed to be un-partitioned, i.e. of type snapshot
   * @param optionalClusterInfo Optional cluster info
   * @param optionalSortOrderInfo Optional sort order
   * @param optionalNumOfBuckets Optional number of buckets
   * @param optionalRowFormatSerde Optional row format serde; default is ORC
   * @param optionalInputFormat Optional input format serde; default is ORC
   * @param optionalOutputFormat Optional output format serde; default is ORC
   * @param tableProperties Optional table properties
   * @param isEvolutionEnabled If schema evolution is turned on
   * @param destinationTableMeta Optional destination table metadata
   * @param hiveColumns Map to populate with the generated Hive columns, for the caller's reference
   * @return Generated DDL query to create the new Hive table
   */
  public static String generateCreateTableDDL(Schema schema, String tblName, String tblLocation,
      Optional<String> optionalDbName, Optional<Map<String, String>> optionalPartitionDDLInfo,
      Optional<List<String>> optionalClusterInfo, Optional<Map<String, COLUMN_SORT_ORDER>> optionalSortOrderInfo,
      Optional<Integer> optionalNumOfBuckets, Optional<String> optionalRowFormatSerde,
      Optional<String> optionalInputFormat, Optional<String> optionalOutputFormat, Properties tableProperties,
      boolean isEvolutionEnabled, Optional<Table> destinationTableMeta, Map<String, String> hiveColumns) {

    Preconditions.checkNotNull(schema);
    Preconditions.checkArgument(StringUtils.isNotBlank(tblName));
    Preconditions.checkArgument(StringUtils.isNotBlank(tblLocation));

    String dbName = optionalDbName.isPresent() ? optionalDbName.get() : DEFAULT_DB_NAME;
    String rowFormatSerde =
        optionalRowFormatSerde.isPresent() ? optionalRowFormatSerde.get() : DEFAULT_ROW_FORMAT_SERDE;
    String inputFormat = optionalInputFormat.isPresent() ? optionalInputFormat.get() : DEFAULT_ORC_INPUT_FORMAT;
    String outputFormat = optionalOutputFormat.isPresent() ? optionalOutputFormat.get() : DEFAULT_ORC_OUTPUT_FORMAT;
    tableProperties = getTableProperties(tableProperties);

    // Start building Hive DDL
    // Refer to the Hive DDL manual for an explanation of the clauses:
    // https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-Create/Drop/TruncateTable
    StringBuilder ddl = new StringBuilder();

    // Create statement
    ddl.append(String.format("CREATE EXTERNAL TABLE IF NOT EXISTS `%s`.`%s` ", dbName, tblName));
    // .. open bracket for CREATE
    ddl.append("( \n");

    // 1. If evolution is enabled and the destination table does not exist:
    //    .. use columns from the new schema (evolution does not matter for a new destination table)
    // 2. If evolution is enabled and the destination table exists:
    //    .. use columns from the new schema (ALTER TABLE will be used before moving data from staging to the final table)
    // 3. If evolution is disabled and the destination table does not exist:
    //    .. use columns from the new schema (evolution does not matter for a new destination table)
    // 4. If evolution is disabled and the destination table exists:
    //    .. use columns from the destination schema
    if (isEvolutionEnabled || !destinationTableMeta.isPresent()) {
      log.info("Generating DDL using source schema");
      ddl.append(generateAvroToHiveColumnMapping(schema, Optional.of(hiveColumns), true));
    } else {
      log.info("Generating DDL using destination schema");
      ddl.append(generateDestinationToHiveColumnMapping(Optional.of(hiveColumns), destinationTableMeta.get()));
    }
    // .. close bracket for CREATE
    ddl.append(") \n");

    // Partition info
    if (optionalPartitionDDLInfo.isPresent() && optionalPartitionDDLInfo.get().size() > 0) {
      ddl.append("PARTITIONED BY ( ");
      boolean isFirst = true;
      Map<String, String> partitionInfoMap = optionalPartitionDDLInfo.get();
      for (Map.Entry<String, String> partitionInfo : partitionInfoMap.entrySet()) {
        if (isFirst) {
          isFirst = false;
        } else {
          ddl.append(", ");
        }
        ddl.append(String.format("`%s` %s", partitionInfo.getKey(), partitionInfo.getValue()));
      }
      ddl.append(" ) \n");
    }

    if (optionalClusterInfo.isPresent()) {
      if (!optionalNumOfBuckets.isPresent()) {
        throw new IllegalArgumentException("CLUSTERED BY requested, but no NUM_BUCKETS specified");
      }
      ddl.append("CLUSTERED BY ( ");
      boolean isFirst = true;
      for (String clusterByCol : optionalClusterInfo.get()) {
        if (!hiveColumns.containsKey(clusterByCol)) {
          throw new IllegalArgumentException(
              String.format("Requested CLUSTERED BY column: %s is not present in schema", clusterByCol));
        }
        if (isFirst) {
          isFirst = false;
        } else {
          ddl.append(", ");
        }
        ddl.append(String.format("`%s`", clusterByCol));
      }
      ddl.append(" ) ");

      if (optionalSortOrderInfo.isPresent() && optionalSortOrderInfo.get().size() > 0) {
        Map<String, COLUMN_SORT_ORDER> sortOrderInfoMap = optionalSortOrderInfo.get();
        ddl.append("SORTED BY ( ");
        isFirst = true;
        for (Map.Entry<String, COLUMN_SORT_ORDER> sortOrderInfo : sortOrderInfoMap.entrySet()) {
          if (!hiveColumns.containsKey(sortOrderInfo.getKey())) {
            throw new IllegalArgumentException(
                String.format("Requested SORTED BY column: %s is not present in schema", sortOrderInfo.getKey()));
          }
          if (isFirst) {
            isFirst = false;
          } else {
            ddl.append(", ");
          }
          ddl.append(String.format("`%s` %s", sortOrderInfo.getKey(), sortOrderInfo.getValue()));
        }
        ddl.append(" ) ");
      }
      ddl.append(String.format(" INTO %s BUCKETS %n", optionalNumOfBuckets.get()));
    } else {
      if (optionalSortOrderInfo.isPresent()) {
        throw new IllegalArgumentException("SORTED BY requested, but no CLUSTERED BY specified");
      }
    }

    // Row format
    ddl.append("ROW FORMAT SERDE \n");
    ddl.append(String.format(" '%s' %n", rowFormatSerde));

    // Stored as ORC
    ddl.append("STORED AS INPUTFORMAT \n");
    ddl.append(String.format(" '%s' %n", inputFormat));
    ddl.append("OUTPUTFORMAT \n");
    ddl.append(String.format(" '%s' %n", outputFormat));

    // Location
    ddl.append("LOCATION \n");
    ddl.append(String.format(" '%s' %n", tblLocation));

    // Table properties
    if (null != tableProperties && tableProperties.size() > 0) {
      ddl.append("TBLPROPERTIES ( \n");
      boolean isFirst = true;
      for (String property : tableProperties.stringPropertyNames()) {
        if (isFirst) {
          isFirst = false;
        } else {
          ddl.append(", \n");
        }
        ddl.append(String.format(" '%s'='%s'", property, tableProperties.getProperty(property)));
      }
      ddl.append(") \n");
    }

    return ddl.toString();
  }

  private static Properties getTableProperties(Properties tableProperties) {
    if (null == tableProperties || tableProperties.size() == 0) {
      return DEFAULT_TBL_PROPERTIES;
    }
    for (String property : DEFAULT_TBL_PROPERTIES.stringPropertyNames()) {
      if (!tableProperties.containsKey(property)) {
        tableProperties.put(property, DEFAULT_TBL_PROPERTIES.get(property));
      }
    }
    return tableProperties;
  }
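  // Illustrative sketch (not in the original source): one way to invoke generateCreateTableDDL for
  // an un-partitioned, un-clustered table. The schema literal, table name, and location are all
  // hypothetical; a null tableProperties falls back to DEFAULT_TBL_PROPERTIES via getTableProperties.
  //
  //   Schema schema = readSchemaFromString(
  //       "{\"type\":\"record\",\"name\":\"user\",\"fields\":[{\"name\":\"id\",\"type\":\"long\"}]}");
  //   Map<String, String> hiveColumns = Maps.newHashMap();
  //   String ddl = generateCreateTableDDL(schema, "user_orc", "/data/user_orc",
  //       Optional.<String>absent(), Optional.<Map<String, String>>absent(),
  //       Optional.<List<String>>absent(), Optional.<Map<String, COLUMN_SORT_ORDER>>absent(),
  //       Optional.<Integer>absent(), Optional.<String>absent(), Optional.<String>absent(),
  //       Optional.<String>absent(), null, true, Optional.<Table>absent(), hiveColumns);
  //
  // The resulting query begins with: CREATE EXTERNAL TABLE IF NOT EXISTS `default`.`user_orc` ...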
  /***
   * Generate a DDL query to create a Hive partition pointing at a specific location.
   * @param dbName Hive database name.
   * @param tableName Hive table name.
   * @param partitionLocation Physical location of the partition.
   * @param partitionsDMLInfo Partition DML info - a map of partition name to partition value.
   * @param format Hive partition file format.
   * @return Commands to create a partition.
   */
  public static List<String> generateCreatePartitionDDL(String dbName, String tableName, String partitionLocation,
      Map<String, String> partitionsDMLInfo, Optional<String> format) {

    if (null == partitionsDMLInfo || partitionsDMLInfo.size() == 0) {
      return Collections.emptyList();
    }

    // Partition details
    StringBuilder partitionSpecs = new StringBuilder();
    partitionSpecs.append("PARTITION (");
    boolean isFirstPartitionSpec = true;
    for (Map.Entry<String, String> partition : partitionsDMLInfo.entrySet()) {
      if (isFirstPartitionSpec) {
        isFirstPartitionSpec = false;
      } else {
        partitionSpecs.append(", ");
      }
      partitionSpecs.append(String.format("`%s`='%s'", partition.getKey(), partition.getValue()));
    }
    partitionSpecs.append(") \n");

    // Create statement
    List<String> ddls = Lists.newArrayList();

    // Note: Hive does not support fully qualified Hive table names such as db.table for ALTER TABLE in v0.13
    // .. hence specifying 'use dbName' as a precursor to rename
    // Refer: HIVE-2496
    ddls.add(String.format("USE %s%n", dbName));
    if (format.isPresent()) {
      ddls.add(String.format("ALTER TABLE `%s` ADD IF NOT EXISTS %s FILEFORMAT %s LOCATION '%s' ", tableName,
          partitionSpecs, format.get(), partitionLocation));
    } else {
      ddls.add(String.format("ALTER TABLE `%s` ADD IF NOT EXISTS %s LOCATION '%s' ", tableName, partitionSpecs,
          partitionLocation));
    }

    return ddls;
  }

  public static List<String> generateCreatePartitionDDL(String dbName, String tableName, String partitionLocation,
      Map<String, String> partitionsDMLInfo) {
    return generateCreatePartitionDDL(dbName, tableName, partitionLocation, partitionsDMLInfo,
        Optional.<String>absent());
  }

  /***
   * Generate a DDL query to drop a Hive table.
   * @param dbName Hive database name.
   * @param tableName Hive table name.
   * @return Command to drop the table.
   */
  public static String generateDropTableDDL(String dbName, String tableName) {
    return String.format("DROP TABLE IF EXISTS `%s`.`%s`", dbName, tableName);
  }
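  // Illustrative note (not in the original source): for a hypothetical call
  //
  //   generateCreatePartitionDDL("tracking", "user_orc", "/data/user_orc/2016/01/01",
  //       ImmutableMap.of("datepartition", "2016-01-01"));
  //
  // the emitted commands are roughly:
  //
  //   USE tracking
  //   ALTER TABLE `user_orc` ADD IF NOT EXISTS PARTITION (`datepartition`='2016-01-01')
  //    LOCATION '/data/user_orc/2016/01/01'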
Received type: %s", schema.getType())); } StringBuilder columns = new StringBuilder(); boolean isFirst; switch (schema.getType()) { case RECORD: isFirst = true; if (topLevel) { for (Schema.Field field : schema.getFields()) { if (isFirst) { isFirst = false; } else { columns.append(", \n"); } String type = generateAvroToHiveColumnMapping(field.schema(), hiveColumns, false); if (hiveColumns.isPresent()) { hiveColumns.get().put(field.name(), type); } String flattenSource = field.getProp("flatten_source"); if (StringUtils.isBlank(flattenSource)) { flattenSource = field.name(); } columns.append(String.format(" `%s` %s COMMENT 'from flatten_source %s'", field.name(), type, flattenSource)); } } else { columns.append(AVRO_TO_HIVE_COLUMN_MAPPING_V_12.get(schema.getType())).append("<"); for (Schema.Field field : schema.getFields()) { if (isFirst) { isFirst = false; } else { columns.append(","); } String type = generateAvroToHiveColumnMapping(field.schema(), hiveColumns, false); columns.append("`").append(field.name()).append("`").append(":").append(type); } columns.append(">"); } break; case UNION: Optional<Schema> optionalType = isOfOptionType(schema); if (optionalType.isPresent()) { Schema optionalTypeSchema = optionalType.get(); columns.append(generateAvroToHiveColumnMapping(optionalTypeSchema, hiveColumns, false)); } else { columns.append(AVRO_TO_HIVE_COLUMN_MAPPING_V_12.get(schema.getType())).append("<"); isFirst = true; for (Schema unionMember : schema.getTypes()) { if (Schema.Type.NULL.equals(unionMember.getType())) { continue; } if (isFirst) { isFirst = false; } else { columns.append(","); } columns.append(generateAvroToHiveColumnMapping(unionMember, hiveColumns, false)); } columns.append(">"); } break; case MAP: columns.append(AVRO_TO_HIVE_COLUMN_MAPPING_V_12.get(schema.getType())).append("<"); columns.append("string,") .append(generateAvroToHiveColumnMapping(schema.getValueType(), hiveColumns, false)); columns.append(">"); break; case ARRAY: columns.append(AVRO_TO_HIVE_COLUMN_MAPPING_V_12.get(schema.getType())).append("<"); columns.append(generateAvroToHiveColumnMapping(schema.getElementType(), hiveColumns, false)); columns.append(">"); break; case NULL: break; case BYTES: case DOUBLE: case ENUM: case FIXED: case FLOAT: case INT: case LONG: case STRING: case BOOLEAN: columns.append(AVRO_TO_HIVE_COLUMN_MAPPING_V_12.get(schema.getType())); break; default: String exceptionMessage = String.format("DDL query generation failed for \"%s\" ", schema); log.error(exceptionMessage); throw new AvroRuntimeException(exceptionMessage); } return columns.toString(); } /*** * Use destination table schema to generate column mapping * @param hiveColumns Optional Map to populate with the generated hive columns for reference of caller * @param destinationTableMeta destination table metadata * @return Generate Hive columns with types for given Avro schema */ private static String generateDestinationToHiveColumnMapping(Optional<Map<String, String>> hiveColumns, Table destinationTableMeta) { StringBuilder columns = new StringBuilder(); boolean isFirst = true; List<FieldSchema> fieldList = destinationTableMeta.getSd().getCols(); for (FieldSchema field : fieldList) { if (isFirst) { isFirst = false; } else { columns.append(", \n"); } String name = field.getName(); String type = escapeHiveType(field.getType()); String comment = field.getComment(); if (hiveColumns.isPresent()) { hiveColumns.get().put(name, type); } columns.append(String.format(" `%s` %s COMMENT '%s'", name, type, comment)); } return columns.toString(); 
  /***
   * Escape the field names inside a Hive type string.
   * @param type Primitive or nested Hive type.
   * @return Escaped Hive type string.
   */
  public static String escapeHiveType(String type) {
    TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(type);

    // Primitive
    if (ObjectInspector.Category.PRIMITIVE.equals(typeInfo.getCategory())) {
      return type;
    }
    // List
    else if (ObjectInspector.Category.LIST.equals(typeInfo.getCategory())) {
      ListTypeInfo listTypeInfo = (ListTypeInfo) typeInfo;
      return serdeConstants.LIST_TYPE_NAME + "<"
          + escapeHiveType(listTypeInfo.getListElementTypeInfo().getTypeName()) + ">";
    }
    // Map
    else if (ObjectInspector.Category.MAP.equals(typeInfo.getCategory())) {
      MapTypeInfo mapTypeInfo = (MapTypeInfo) typeInfo;
      return serdeConstants.MAP_TYPE_NAME + "<"
          + escapeHiveType(mapTypeInfo.getMapKeyTypeInfo().getTypeName()) + ","
          + escapeHiveType(mapTypeInfo.getMapValueTypeInfo().getTypeName()) + ">";
    }
    // Struct
    else if (ObjectInspector.Category.STRUCT.equals(typeInfo.getCategory())) {
      StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
      List<String> allStructFieldNames = structTypeInfo.getAllStructFieldNames();
      List<TypeInfo> allStructFieldTypeInfos = structTypeInfo.getAllStructFieldTypeInfos();
      StringBuilder sb = new StringBuilder();
      sb.append(serdeConstants.STRUCT_TYPE_NAME + "<");
      for (int i = 0; i < allStructFieldNames.size(); i++) {
        if (i > 0) {
          sb.append(",");
        }
        sb.append("`");
        sb.append(allStructFieldNames.get(i));
        sb.append("`");
        sb.append(":");
        sb.append(escapeHiveType(allStructFieldTypeInfos.get(i).getTypeName()));
      }
      sb.append(">");
      return sb.toString();
    }
    // Union
    else if (ObjectInspector.Category.UNION.equals(typeInfo.getCategory())) {
      UnionTypeInfo unionTypeInfo = (UnionTypeInfo) typeInfo;
      List<TypeInfo> allUnionObjectTypeInfos = unionTypeInfo.getAllUnionObjectTypeInfos();
      StringBuilder sb = new StringBuilder();
      sb.append(serdeConstants.UNION_TYPE_NAME + "<");
      for (int i = 0; i < allUnionObjectTypeInfos.size(); i++) {
        if (i > 0) {
          sb.append(",");
        }
        sb.append(escapeHiveType(allUnionObjectTypeInfos.get(i).getTypeName()));
      }
      sb.append(">");
      return sb.toString();
    } else {
      throw new RuntimeException("Unknown type encountered: " + type);
    }
  }

  /***
   * Check if the Avro schema is of type OPTION, i.e. [null, TYPE] or [TYPE, null].
   * @param schema Avro schema to check.
   * @return The non-null member schema, if the schema is of type OPTION.
   */
  private static Optional<Schema> isOfOptionType(Schema schema) {
    Preconditions.checkNotNull(schema);

    // If not of type UNION, it can't be an OPTION
    if (!Schema.Type.UNION.equals(schema.getType())) {
      return Optional.<Schema>absent();
    }

    // If it has anything other than exactly two members, it can't be an OPTION
    List<Schema> types = schema.getTypes();
    if (null != types && types.size() == 2) {
      Schema first = types.get(0);
      Schema second = types.get(1);
      // One member should be of type NULL and the other of a non-NULL type
      if (Schema.Type.NULL.equals(first.getType()) && !Schema.Type.NULL.equals(second.getType())) {
        return Optional.of(second);
      } else if (!Schema.Type.NULL.equals(first.getType()) && Schema.Type.NULL.equals(second.getType())) {
        return Optional.of(first);
      }
    }

    return Optional.<Schema>absent();
  }
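  // Illustrative note (not in the original source): escapeHiveType leaves primitives untouched and
  // back-quotes field names inside complex types, e.g.
  //
  //   escapeHiveType("int")                          -> "int"
  //   escapeHiveType("struct<name:string,age:int>")  -> "struct<`name`:string,`age`:int>"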
  /***
   * Generate a DML mapping query to populate the output-schema table by selecting from the input-schema table.
   * This method assumes that each output schema field has a corresponding source input table field reference
   * .. in the form of a 'flatten_source' property.
   * @param inputAvroSchema Input schema that was used to obtain the output schema (next argument)
   * @param outputOrcSchema Output schema (flattened or nested) that was generated using the input schema
   *                        .. and has lineage information compatible with the input schema
   * @param inputTblName Input table name
   * @param outputTblName Output table name
   * @param optionalInputDbName Optional input DB name; if not specified, it defaults to 'default'
   * @param optionalOutputDbName Optional output DB name; if not specified, it defaults to 'default'
   * @param optionalPartitionDMLInfo Optional partition info in the form of a map of partition key, partition value pairs
   * @param optionalOverwriteTable Optional overwrite table flag; if not specified, it defaults to true
   * @param optionalCreateIfNotExists Optional create-if-not-exists flag; if not specified, it defaults to false
   * @param isEvolutionEnabled If schema evolution is turned on
   * @param destinationTableMeta Optional destination table metadata
   * @param rowLimit Optional row limit
   * @return DML query
   */
  public static String generateTableMappingDML(Schema inputAvroSchema, Schema outputOrcSchema, String inputTblName,
      String outputTblName, Optional<String> optionalInputDbName, Optional<String> optionalOutputDbName,
      Optional<Map<String, String>> optionalPartitionDMLInfo, Optional<Boolean> optionalOverwriteTable,
      Optional<Boolean> optionalCreateIfNotExists, boolean isEvolutionEnabled, Optional<Table> destinationTableMeta,
      Optional<Integer> rowLimit) {
    Preconditions.checkNotNull(inputAvroSchema);
    Preconditions.checkNotNull(outputOrcSchema);
    Preconditions.checkArgument(StringUtils.isNotBlank(inputTblName));
    Preconditions.checkArgument(StringUtils.isNotBlank(outputTblName));

    String inputDbName = optionalInputDbName.isPresent() ? optionalInputDbName.get() : DEFAULT_DB_NAME;
    String outputDbName = optionalOutputDbName.isPresent() ? optionalOutputDbName.get() : DEFAULT_DB_NAME;
    boolean shouldOverwriteTable = optionalOverwriteTable.isPresent() ? optionalOverwriteTable.get() : true;
    boolean shouldCreateIfNotExists = optionalCreateIfNotExists.isPresent() ? optionalCreateIfNotExists.get() : false;

    log.debug("Input Schema: " + inputAvroSchema.toString());
    log.debug("Output Schema: " + outputOrcSchema.toString());

    // Start building Hive DML
    // Refer to the Hive DML manual for an explanation of the clauses:
    // https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DML#LanguageManualDML-InsertingdataintoHiveTablesfromqueries
    StringBuilder dmlQuery = new StringBuilder();

    // Insert query
    if (shouldOverwriteTable) {
      dmlQuery.append(String.format("INSERT OVERWRITE TABLE `%s`.`%s` %n", outputDbName, outputTblName));
    } else {
      dmlQuery.append(String.format("INSERT INTO TABLE `%s`.`%s` %n", outputDbName, outputTblName));
    }

    // Partition details
    if (optionalPartitionDMLInfo.isPresent()) {
      if (optionalPartitionDMLInfo.get().size() > 0) {
        dmlQuery.append("PARTITION (");
        boolean isFirstPartitionSpec = true;
        for (Map.Entry<String, String> partition : optionalPartitionDMLInfo.get().entrySet()) {
          if (isFirstPartitionSpec) {
            isFirstPartitionSpec = false;
          } else {
            dmlQuery.append(", ");
          }
          dmlQuery.append(String.format("`%s`='%s'", partition.getKey(), partition.getValue()));
        }
        dmlQuery.append(") \n");
      }
    }

    // If not exists
    if (shouldCreateIfNotExists) {
      dmlQuery.append(" IF NOT EXISTS \n");
    }

    // Select query
    dmlQuery.append("SELECT \n");
    // 1. If evolution is enabled and the destination table does not exist:
    //    .. use columns from the new schema (evolution does not matter for a new destination table)
    // 2. If evolution is enabled and the destination table exists:
    //    .. use columns from the new schema (ALTER TABLE will be used before moving data from staging to the final table)
    // 3. If evolution is disabled and the destination table does not exist:
    //    .. use columns from the new schema (evolution does not matter for a new destination table)
    // 4. If evolution is disabled and the destination table exists:
    //    .. use columns from the destination schema
    if (isEvolutionEnabled || !destinationTableMeta.isPresent()) {
      log.info("Generating DML using source schema");
      boolean isFirst = true;
      List<Schema.Field> fieldList = outputOrcSchema.getFields();
      for (Schema.Field field : fieldList) {
        String flattenSource = field.getProp("flatten_source");
        String colName;
        if (StringUtils.isNotBlank(flattenSource)) {
          colName = flattenSource;
        } else {
          colName = field.name();
        }
        // Escape the column name
        colName = colName.replaceAll("\\.", "`.`");
        if (isFirst) {
          isFirst = false;
        } else {
          dmlQuery.append(", \n");
        }
        dmlQuery.append(String.format(" `%s`", colName));
      }
    } else {
      log.info("Generating DML using destination schema");
      boolean isFirst = true;
      List<FieldSchema> fieldList = destinationTableMeta.get().getSd().getCols();
      for (FieldSchema field : fieldList) {
        String colName = StringUtils.EMPTY;
        if (field.isSetComment() && field.getComment().startsWith("from flatten_source ")) {
          // Retrieve the source column (flatten_source) from the comment
          colName = field.getComment().replaceAll("from flatten_source ", "").trim();
        } else {
          // Or else find the field in the flattened schema
          List<Schema.Field> evolvedFieldList = outputOrcSchema.getFields();
          for (Schema.Field evolvedField : evolvedFieldList) {
            if (evolvedField.name().equalsIgnoreCase(field.getName())) {
              String flattenSource = evolvedField.getProp("flatten_source");
              if (StringUtils.isNotBlank(flattenSource)) {
                colName = flattenSource;
              } else {
                colName = evolvedField.name();
              }
              break;
            }
          }
        }
        // Escape the column name
        colName = colName.replaceAll("\\.", "`.`");
        // colName can be blank if the column was deleted in the new evolved schema, so we shouldn't try to fetch it
        if (StringUtils.isNotBlank(colName)) {
          if (isFirst) {
            isFirst = false;
          } else {
            dmlQuery.append(", \n");
          }
          dmlQuery.append(String.format(" `%s`", colName));
        }
      }
    }

    dmlQuery.append(String.format(" %n FROM `%s`.`%s` ", inputDbName, inputTblName));

    // Partition details
    if (optionalPartitionDMLInfo.isPresent()) {
      if (optionalPartitionDMLInfo.get().size() > 0) {
        dmlQuery.append("WHERE ");
        boolean isFirstPartitionSpec = true;
        for (Map.Entry<String, String> partition : optionalPartitionDMLInfo.get().entrySet()) {
          if (isFirstPartitionSpec) {
            isFirstPartitionSpec = false;
          } else {
            dmlQuery.append(" AND ");
          }
          dmlQuery.append(String.format("`%s`='%s'", partition.getKey(), partition.getValue()));
        }
        dmlQuery.append(" \n");
      }
    }

    // Limit clause
    if (rowLimit.isPresent()) {
      dmlQuery.append(String.format("LIMIT %s", rowLimit.get()));
    }

    return dmlQuery.toString();
  }

  public static Schema readSchemaFromString(String schemaStr) throws IOException {
    return new Schema.Parser().parse(schemaStr);
  }
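  // Illustrative note (not in the original source): for a hypothetical flattened output schema with
  // fields id and name, an overwrite-mode call partitioned on datepartition produces roughly:
  //
  //   INSERT OVERWRITE TABLE `default`.`user_orc`
  //   PARTITION (`datepartition`='2016-01-01')
  //   SELECT
  //    `id`,
  //    `name`
  //    FROM `default`.`user_avro` WHERE `datepartition`='2016-01-01'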
  /***
   * Generate DDLs to evolve the final destination table.
   * @param stagingTableName Staging table.
   * @param finalTableName Un-evolved final destination table.
   * @param optionalStagingDbName Optional staging database name; defaults to 'default'.
   * @param optionalFinalDbName Optional final database name; defaults to 'default'.
   * @param evolvedSchema Evolved Avro schema.
   * @param isEvolutionEnabled Is schema evolution enabled.
   * @param evolvedColumns Evolved columns in Hive format.
   * @param destinationTableMeta Destination table metadata.
   * @return DDLs to evolve the final destination table.
   */
  public static List<String> generateEvolutionDDL(String stagingTableName, String finalTableName,
      Optional<String> optionalStagingDbName, Optional<String> optionalFinalDbName, Schema evolvedSchema,
      boolean isEvolutionEnabled, Map<String, String> evolvedColumns, Optional<Table> destinationTableMeta) {
    // If schema evolution is disabled, do nothing; likewise,
    // if the destination table does not exist, do nothing
    if (!isEvolutionEnabled || !destinationTableMeta.isPresent()) {
      return Collections.emptyList();
    }

    String stagingDbName = optionalStagingDbName.isPresent() ? optionalStagingDbName.get() : DEFAULT_DB_NAME;
    String finalDbName = optionalFinalDbName.isPresent() ? optionalFinalDbName.get() : DEFAULT_DB_NAME;

    List<String> ddl = Lists.newArrayList();

    // Evolve schema
    Table destinationTable = destinationTableMeta.get();
    if (destinationTable.getSd().getCols().size() == 0) {
      log.warn("Destination Table: " + destinationTable + " does not have column details in its StorageDescriptor. "
          + "It is probably of Avro type. Cannot evolve via traditional HQL, so skipping evolution checks.");
      return ddl;
    }
    for (Map.Entry<String, String> evolvedColumn : evolvedColumns.entrySet()) {
      // Find the evolved column in the destination table
      boolean found = false;
      for (FieldSchema destinationField : destinationTable.getSd().getCols()) {
        if (destinationField.getName().equalsIgnoreCase(evolvedColumn.getKey())) {
          // If the evolved column is found but its type has evolved, evolve it
          // .. if incompatible, isTypeEvolved will throw an exception
          if (isTypeEvolved(evolvedColumn.getValue(), destinationField.getType())) {
            ddl.add(String.format("USE %s%n", finalDbName));
            ddl.add(String.format("ALTER TABLE `%s` CHANGE COLUMN %s %s %s COMMENT '%s'", finalTableName,
                evolvedColumn.getKey(), evolvedColumn.getKey(), evolvedColumn.getValue(),
                destinationField.getComment()));
          }
          found = true;
          break;
        }
      }
      if (!found) {
        // If the evolved column is not found, i.e. it is new, add the column
        String flattenSource = evolvedSchema.getField(evolvedColumn.getKey()).getProp("flatten_source");
        if (StringUtils.isBlank(flattenSource)) {
          flattenSource = evolvedSchema.getField(evolvedColumn.getKey()).name();
        }
        // Note: Hive does not support fully qualified Hive table names such as db.table for ALTER TABLE in v0.13
        // .. hence specifying 'use dbName' as a precursor to rename
        // Refer: HIVE-2496
        ddl.add(String.format("USE %s%n", finalDbName));
        ddl.add(String.format("ALTER TABLE `%s` ADD COLUMNS (%s %s COMMENT 'from flatten_source %s')", finalTableName,
            evolvedColumn.getKey(), evolvedColumn.getValue(), flattenSource));
      }
    }

    return ddl;
  }
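  // Illustrative note (not in the original source): if the evolved schema adds a hypothetical new
  // column age of Hive type int, the emitted commands are roughly:
  //
  //   USE default
  //   ALTER TABLE `user_orc` ADD COLUMNS (age int COMMENT 'from flatten_source age')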
* <p> * ALTER TABLE finalTableName DROP IF EXISTS PARTITION partition_spec, PARTITION partition_spec, ...; * </p> * @param finalTableName Table name where partitions are dropped * @param partitionsDMLInfo Partitions to be dropped * @return DDL to drop partitions in <code>finalTableName</code> */ public static List<String> generateDropPartitionsDDL(final String dbName, final String finalTableName, final Map<String, String> partitionsDMLInfo) { if (null == partitionsDMLInfo || partitionsDMLInfo.isEmpty()) { return Collections.emptyList(); } // Partition details StringBuilder partitionSpecs = new StringBuilder(); partitionSpecs.append("PARTITION ("); boolean isFirstPartitionSpec = true; for (Map.Entry<String, String> partition : partitionsDMLInfo.entrySet()) { if (isFirstPartitionSpec) { isFirstPartitionSpec = false; } else { partitionSpecs.append(", "); } partitionSpecs.append(String.format("`%s`='%s'", partition.getKey(), partition.getValue())); } partitionSpecs.append(") "); List<String> ddls = Lists.newArrayList(); // Note: Hive does not support fully qualified Hive table names such as db.table for ALTER TABLE in v0.13 // .. hence specifying 'use dbName' as a precursor to rename // Refer: HIVE-2496 ddls.add(String.format("USE %s%n", dbName)); ddls.add(String.format("ALTER TABLE %s DROP IF EXISTS %s", finalTableName, partitionSpecs)); return ddls; } /** * Generate DDL for dropping partitions of a table. * <p> * ALTER TABLE finalTableName DROP IF EXISTS PARTITION partition_spec, PARTITION partition_spec, ...; * </p> * @param finalTableName Table name where partitions are dropped * @param partitionDMLInfos list of Partition to be dropped * @return DDL to drop partitions in <code>finalTableName</code> */ public static List<String> generateDropPartitionsDDL(final String dbName, final String finalTableName, final List<Map<String, String>> partitionDMLInfos) { if (partitionDMLInfos.isEmpty()) { return Collections.emptyList(); } List<String> ddls = Lists.newArrayList(); ddls.add(String.format("USE %s %n", dbName)); // Join the partition specs ddls.add(String.format("ALTER TABLE %s DROP IF EXISTS %s", finalTableName, Joiner.on(",").join(Iterables.transform(partitionDMLInfos, PARTITION_SPEC_GENERATOR)))); return ddls; } /*** * Generate DDL for creating and updating view over a table. * * Create view: * <p> * CREATE VIEW IF NOT EXISTS db.viewName AS SELECT * FROM db.tableName * </p> * * Update view: * <p> * ALTER VIEW db.viewName AS SELECT * FROM db.tableName * </p> * * @param tableDbName Database for the table over which view has to be created. * @param tableName Table over which view has to be created. * @param viewDbName Database for the view to be created. * @param viewName View to be created. * @param shouldUpdateView If view should be forced re-built. * @return DDLs to create and / or update view over a table */ public static List<String> generateCreateOrUpdateViewDDL(final String tableDbName, final String tableName, final String viewDbName, final String viewName, final boolean shouldUpdateView) { Preconditions.checkArgument(StringUtils.isNotBlank(tableName), "Table name should not be empty"); Preconditions.checkArgument(StringUtils.isNotBlank(viewName), "View name should not be empty"); // Resolve defaults String resolvedTableDbName = (StringUtils.isBlank(tableDbName)) ? DEFAULT_DB_NAME : tableDbName; String resolvedViewDbName = (StringUtils.isBlank(viewDbName)) ? 
  /***
   * Generate DDL for creating and updating a view over a table.
   *
   * Create view:
   * <p>
   * CREATE VIEW IF NOT EXISTS db.viewName AS SELECT * FROM db.tableName
   * </p>
   *
   * Update view:
   * <p>
   * ALTER VIEW db.viewName AS SELECT * FROM db.tableName
   * </p>
   *
   * @param tableDbName Database of the table over which the view has to be created.
   * @param tableName Table over which the view has to be created.
   * @param viewDbName Database of the view to be created.
   * @param viewName View to be created.
   * @param shouldUpdateView If the view should be forcibly re-built.
   * @return DDLs to create and / or update a view over the table.
   */
  public static List<String> generateCreateOrUpdateViewDDL(final String tableDbName, final String tableName,
      final String viewDbName, final String viewName, final boolean shouldUpdateView) {
    Preconditions.checkArgument(StringUtils.isNotBlank(tableName), "Table name should not be empty");
    Preconditions.checkArgument(StringUtils.isNotBlank(viewName), "View name should not be empty");

    // Resolve defaults
    String resolvedTableDbName = (StringUtils.isBlank(tableDbName)) ? DEFAULT_DB_NAME : tableDbName;
    String resolvedViewDbName = (StringUtils.isBlank(viewDbName)) ? DEFAULT_DB_NAME : viewDbName;

    List<String> ddls = Lists.newArrayList();
    // No-op if the view already exists
    ddls.add(String.format("CREATE VIEW IF NOT EXISTS `%s`.`%s` AS SELECT * FROM `%s`.`%s`", resolvedViewDbName,
        viewName, resolvedTableDbName, tableName));
    // This will force a re-build of the view
    if (shouldUpdateView) {
      ddls.add(String.format("ALTER VIEW `%s`.`%s` AS SELECT * FROM `%s`.`%s`", resolvedViewDbName, viewName,
          resolvedTableDbName, tableName));
    }

    return ddls;
  }

  /***
   * Generate DDL for updating the file format of a table or partition.
   * If the partition spec is absent, a DDL query to change the storage format of the table is generated.
   *
   * Query syntax:
   * <p>
   * ALTER TABLE tableName [PARTITION partition_spec] SET FILEFORMAT fileFormat
   * </p>
   *
   * @param dbName Database of the table whose storage format needs to be changed.
   * @param tableName Table whose storage format needs to be changed.
   * @param partitionsDMLInfo Optional partition spec for which the storage format needs to be changed.
   * @param format Storage format.
   * @return DDL to change the storage format of the table or partition.
   */
  public static List<String> generateAlterTableOrPartitionStorageFormatDDL(final String dbName,
      final String tableName, final Optional<Map<String, String>> partitionsDMLInfo, String format) {
    Preconditions.checkArgument(StringUtils.isNotBlank(tableName), "Table name should not be empty");
    Preconditions.checkArgument(StringUtils.isNotBlank(format), "Format should not be empty");

    // Resolve defaults
    String resolvedDbName = (StringUtils.isBlank(dbName)) ? DEFAULT_DB_NAME : dbName;

    // Partition details
    StringBuilder partitionSpecs = new StringBuilder();
    if (partitionsDMLInfo.isPresent()) {
      partitionSpecs.append("PARTITION (");
      boolean isFirstPartitionSpec = true;
      for (Map.Entry<String, String> partition : partitionsDMLInfo.get().entrySet()) {
        if (isFirstPartitionSpec) {
          isFirstPartitionSpec = false;
        } else {
          partitionSpecs.append(", ");
        }
        partitionSpecs.append(String.format("`%s`='%s'", partition.getKey(), partition.getValue()));
      }
      partitionSpecs.append(") ");
    }

    List<String> ddls = Lists.newArrayList();

    // Note: Hive does not support fully qualified Hive table names such as db.table for ALTER TABLE in v0.13
    // .. hence specifying 'use dbName' as a precursor to rename
    // Refer: HIVE-2496
    ddls.add(String.format("USE %s%n", resolvedDbName));
    ddls.add(String.format("ALTER TABLE %s %s SET FILEFORMAT %s", tableName, partitionSpecs, format));

    return ddls;
  }

  /***
   * Serialize a {@link QueryBasedHivePublishEntity} into a {@link State} at {@link #SERIALIZED_PUBLISH_TABLE_COMMANDS}.
   * @param state {@link State} to serialize the entity into.
   * @param queryBasedHivePublishEntity Entity to carry to the publisher.
   */
  public static void serializePublishCommands(State state, QueryBasedHivePublishEntity queryBasedHivePublishEntity) {
    state.setProp(HiveAvroORCQueryGenerator.SERIALIZED_PUBLISH_TABLE_COMMANDS,
        GSON.toJson(queryBasedHivePublishEntity));
  }
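  // Illustrative sketch (not in the original source): the publish commands round-trip through the
  // work-unit State as pretty-printed JSON. A hypothetical usage, where publishEntity is a
  // previously built QueryBasedHivePublishEntity:
  //
  //   State state = new State();
  //   serializePublishCommands(state, publishEntity);
  //   QueryBasedHivePublishEntity restored = deserializePublishCommands(state);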
  /***
   * Deserialize the publish entity from a {@link State} at {@link #SERIALIZED_PUBLISH_TABLE_COMMANDS}.
   * @param state {@link State} to look into for the serialized entity.
   * @return Publish table entity.
   */
  public static QueryBasedHivePublishEntity deserializePublishCommands(State state) {
    QueryBasedHivePublishEntity queryBasedHivePublishEntity =
        GSON.fromJson(state.getProp(HiveAvroORCQueryGenerator.SERIALIZED_PUBLISH_TABLE_COMMANDS),
            QueryBasedHivePublishEntity.class);
    return queryBasedHivePublishEntity == null ? new QueryBasedHivePublishEntity() : queryBasedHivePublishEntity;
  }

  public static boolean isTypeEvolved(String evolvedType, String destinationType) {
    if (evolvedType.equalsIgnoreCase(destinationType)) {
      // Same type, not evolved
      return false;
    }

    // Look for compatibility of the evolved type
    if (HIVE_COMPATIBLE_TYPES.containsKey(destinationType)) {
      if (HIVE_COMPATIBLE_TYPES.get(destinationType).contains(evolvedType)) {
        return true;
      } else {
        throw new RuntimeException(
            String.format("Incompatible type evolution from: %s to: %s", destinationType, evolvedType));
      }
    } else {
      // We assume all complex types are compatible
      // TODO: Add a compatibility check when ORC evolution supports complex types
      return true;
    }
  }

  /**
   * Generate a partition spec in Hive standard syntax:
   * (partition_column=partition_col_value, partition_column=partition_col_value, ...)
   */
  private static final Function<Map<String, String>, String> PARTITION_SPEC_GENERATOR =
      new Function<Map<String, String>, String>() {
        @Override
        public String apply(Map<String, String> partitionDMLInfo) {
          if (partitionDMLInfo == null) {
            return StringUtils.EMPTY;
          }
          return String.format(" PARTITION (%s)",
              Joiner.on(",").withKeyValueSeparator("=")
                  .join(Maps.transformValues(partitionDMLInfo, QUOTE_PARTITION_VALUES)));
        }
      };

  private static final Function<String, String> QUOTE_PARTITION_VALUES = new Function<String, String>() {
    @Override
    public String apply(String value) {
      return String.format("'%s'", value);
    }
  };
}