com.uber.hoodie.hive.client.SchemaUtil.java Source code

Introduction

Here is the source code for com.uber.hoodie.hive.client.SchemaUtil.java, a utility class that converts Parquet storage schemas into their Hive equivalents, computes schema differences, and generates Hive DDL for Hoodie datasets.

Source

/*
 * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *           http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.hive.client;

import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.uber.hoodie.hive.HoodieHiveDatasetException;
import com.uber.hoodie.hive.model.HoodieDatasetReference;
import com.uber.hoodie.hive.model.SchemaDifference;
import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import parquet.schema.DecimalMetadata;
import parquet.schema.GroupType;
import parquet.schema.MessageType;
import parquet.schema.OriginalType;
import parquet.schema.PrimitiveType;
import parquet.schema.Type;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Schema Utilities
 */
public class SchemaUtil {
    private static final Logger LOG = LoggerFactory.getLogger(SchemaUtil.class);

    /**
     * Get the schema difference between the storage schema and hive table schema
     *
     * @param storageSchema Parquet schema read from the dataset's storage files
     * @param tableSchema   current Hive table schema, as a map of column name to column type
     * @param partitionKeys partition column names, which need not be part of the storage schema
     * @return the column additions, deletions and type updates needed to bring the table in sync
     */
    public static SchemaDifference getSchemaDifference(MessageType storageSchema, Map<String, String> tableSchema,
            String[] partitionKeys) {
        Map<String, String> newTableSchema;
        try {
            newTableSchema = convertParquetSchemaToHiveSchema(storageSchema);
        } catch (IOException e) {
            throw new HoodieHiveDatasetException("Failed to convert parquet schema to hive schema", e);
        }
        LOG.info("Getting schema difference for " + tableSchema + "\r\n\r\n" + newTableSchema);
        SchemaDifference.Builder schemaDiffBuilder = SchemaDifference.newBuilder(storageSchema, tableSchema);
        Set<String> tableColumns = Sets.newHashSet();

        for (Map.Entry<String, String> field : tableSchema.entrySet()) {
            String fieldName = field.getKey().toLowerCase();
            String tickSurroundedFieldName = tickSurround(fieldName);
            if (!isFieldExistsInSchema(newTableSchema, tickSurroundedFieldName)
                    && !ArrayUtils.contains(partitionKeys, fieldName)) {
                schemaDiffBuilder.deleteTableColumn(fieldName);
            } else {
                // check type
                String tableColumnType = field.getValue();
                if (!isFieldExistsInSchema(newTableSchema, tickSurroundedFieldName)) {
                    if (ArrayUtils.contains(partitionKeys, fieldName)) {
                        // Partition key does not have to be part of the storage schema
                        continue;
                    }
                    // We will log this and continue. Hive schema is a superset of all parquet schemas
                    LOG.warn("Ignoring table column " + fieldName + " as it is not present in the parquet schema");
                    continue;
                }
                tableColumnType = tableColumnType.replaceAll("\\s+", "");

                String expectedType = getExpectedType(newTableSchema, tickSurroundedFieldName);
                expectedType = expectedType.replaceAll("\\s+", "");
                expectedType = expectedType.replaceAll("`", "");

                if (!tableColumnType.equalsIgnoreCase(expectedType)) {
                    // check for incremental datasets, the schema type change is allowed as per evolution rules
                    if (!isSchemaTypeUpdateAllowed(tableColumnType, expectedType)) {
                        throw new HoodieHiveDatasetException("Could not convert field Type from " + tableColumnType
                                + " to " + expectedType + " for field " + fieldName);
                    }
                    schemaDiffBuilder.updateTableColumn(fieldName,
                            getExpectedType(newTableSchema, tickSurroundedFieldName));
                }
            }
            tableColumns.add(tickSurroundedFieldName);
        }

        for (Map.Entry<String, String> entry : newTableSchema.entrySet()) {
            if (!tableColumns.contains(entry.getKey().toLowerCase())) {
                schemaDiffBuilder.addTableColumn(entry.getKey(), entry.getValue());
            }
        }
        LOG.info("Difference between schemas: " + schemaDiffBuilder.build().toString());

        return schemaDiffBuilder.build();
    }
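
    // Illustrative example (not from the original source): if the storage schema gains a
    // "fare" column of type double that the Hive table lacks, the returned SchemaDifference
    // records `fare` -> "double" as a column to add.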

    private static String getExpectedType(Map<String, String> newTableSchema, String fieldName) {
        for (Map.Entry<String, String> entry : newTableSchema.entrySet()) {
            if (entry.getKey().toLowerCase().equals(fieldName)) {
                return entry.getValue();
            }
        }
        return null;
    }

    private static boolean isFieldExistsInSchema(Map<String, String> newTableSchema, String fieldName) {
        for (String entry : newTableSchema.keySet()) {
            if (entry.toLowerCase().equals(fieldName)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns the equivalent Hive table schema read from a parquet file
     *
     * @param messageType : Parquet Schema
     * @return : Hive Table schema read from the parquet file, as a {@code Map<String, String>}
     * @throws IOException
     */
    public static Map<String, String> convertParquetSchemaToHiveSchema(MessageType messageType) throws IOException {
        Map<String, String> schema = Maps.newLinkedHashMap();
        List<Type> parquetFields = messageType.getFields();
        for (Type parquetType : parquetFields) {
            StringBuilder result = new StringBuilder();
            String key = parquetType.getName();
            if (parquetType.isRepetition(Type.Repetition.REPEATED)) {
                result.append(createHiveArray(parquetType, ""));
            } else {
                result.append(convertField(parquetType));
            }

            schema.put(hiveCompatibleFieldName(key, false), result.toString());
        }
        return schema;
    }
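
    // Illustrative example (not from the original source): given the Parquet schema
    //   message m { required int32 id; optional binary name (UTF8); }
    // this method returns the ordered map {`id` -> "int", `name` -> "string"}.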

    /**
     * Convert one field of a parquet schema into the equivalent Hive schema type
     *
     * @param parquetType : Single parquet field
     * @return : Equivalent Hive schema type
     */
    private static String convertField(final Type parquetType) {
        StringBuilder field = new StringBuilder();
        if (parquetType.isPrimitive()) {
            final PrimitiveType.PrimitiveTypeName parquetPrimitiveTypeName = parquetType.asPrimitiveType()
                    .getPrimitiveTypeName();
            final OriginalType originalType = parquetType.getOriginalType();
            if (originalType == OriginalType.DECIMAL) {
                final DecimalMetadata decimalMetadata = parquetType.asPrimitiveType().getDecimalMetadata();
                return field.append("DECIMAL(").append(decimalMetadata.getPrecision()).append(", ")
                        .append(decimalMetadata.getScale()).append(")").toString();
            }
            // TODO - fix the method naming here
            return parquetPrimitiveTypeName
                    .convert(new PrimitiveType.PrimitiveTypeNameConverter<String, RuntimeException>() {
                        @Override
                        public String convertBOOLEAN(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
                            return "boolean";
                        }

                        @Override
                        public String convertINT32(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
                            return "int";
                        }

                        @Override
                        public String convertINT64(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
                            return "bigint";
                        }

                        @Override
                        public String convertINT96(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
                            return "timestamp-millis";
                        }

                        @Override
                        public String convertFLOAT(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
                            return "float";
                        }

                        @Override
                        public String convertDOUBLE(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
                            return "double";
                        }

                        @Override
                        public String convertFIXED_LEN_BYTE_ARRAY(
                                PrimitiveType.PrimitiveTypeName primitiveTypeName) {
                            return "binary";
                        }

                        @Override
                        public String convertBINARY(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
                            if (originalType == OriginalType.UTF8 || originalType == OriginalType.ENUM) {
                                return "string";
                            } else {
                                return "binary";
                            }
                        }
                    });
        } else {
            GroupType parquetGroupType = parquetType.asGroupType();
            OriginalType originalType = parquetGroupType.getOriginalType();
            if (originalType != null) {
                switch (originalType) {
                case LIST:
                    if (parquetGroupType.getFieldCount() != 1) {
                        throw new UnsupportedOperationException("Invalid list type " + parquetGroupType);
                    }
                    Type elementType = parquetGroupType.getType(0);
                    if (!elementType.isRepetition(Type.Repetition.REPEATED)) {
                        throw new UnsupportedOperationException("Invalid list type " + parquetGroupType);
                    }
                    return createHiveArray(elementType, parquetGroupType.getName());
                case MAP:
                    if (parquetGroupType.getFieldCount() != 1 || parquetGroupType.getType(0).isPrimitive()) {
                        throw new UnsupportedOperationException("Invalid map type " + parquetGroupType);
                    }
                    GroupType mapKeyValType = parquetGroupType.getType(0).asGroupType();
                    if (!mapKeyValType.isRepetition(Type.Repetition.REPEATED)
                            || !mapKeyValType.getOriginalType().equals(OriginalType.MAP_KEY_VALUE)
                            || mapKeyValType.getFieldCount() != 2) {
                        throw new UnsupportedOperationException("Invalid map type " + parquetGroupType);
                    }
                    Type keyType = mapKeyValType.getType(0);
                    if (!keyType.isPrimitive()
                            || !keyType.asPrimitiveType().getPrimitiveTypeName()
                                    .equals(PrimitiveType.PrimitiveTypeName.BINARY)
                            || !keyType.getOriginalType().equals(OriginalType.UTF8)) {
                        throw new UnsupportedOperationException("Map key type must be binary (UTF8): " + keyType);
                    }
                    Type valueType = mapKeyValType.getType(1);
                    return createHiveMap(convertField(keyType), convertField(valueType));
                case ENUM:
                case UTF8:
                    return "string";
                case MAP_KEY_VALUE:
                    // MAP_KEY_VALUE was supposed to annotate the key/value group level inside a MAP.
                    // However, that is always implied by the structure of MAP, so PARQUET-113
                    // dropped the requirement for having MAP_KEY_VALUE; fall through to default.
                default:
                    throw new UnsupportedOperationException("Cannot convert Parquet type " + parquetType);
                }
            } else {
                // if no original type then it's a record
                return createHiveStruct(parquetGroupType.getFields());
            }
        }
    }

    /**
     * Return a 'struct' Hive schema from a list of Parquet fields
     *
     * @param parquetFields : list of parquet fields
     * @return : Equivalent 'struct' Hive schema
     */
    private static String createHiveStruct(List<Type> parquetFields) {
        StringBuilder struct = new StringBuilder();
        struct.append("STRUCT< ");
        for (Type field : parquetFields) {
            // TODO: struct field names are only translated to support the special char ($).
            // We will need to extend this to other collection types.
            struct.append(hiveCompatibleFieldName(field.getName(), true)).append(" : ");
            struct.append(convertField(field)).append(", ");
        }
        struct.delete(struct.length() - 2, struct.length()); // Remove the last ", "
        struct.append(">");
        String finalStr = struct.toString();
        // Hive struct field names cannot contain '-'. userstore_udr_entities has a uuid field
        // in a struct, which would break the schema; HDrone sync should not fail because of this.
        finalStr = finalStr.replaceAll("-", "_");
        return finalStr;
    }
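
    // Illustrative example (not from the original source): two primitive double fields
    // named "lat" and "lng" yield "STRUCT< `lat` : double, `lng` : double>".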

    private static String hiveCompatibleFieldName(String fieldName, boolean isNested) {
        String result = fieldName;
        if (isNested) {
            result = ColumnNameXLator.translateNestedColumn(fieldName);
        }
        return tickSurround(result);
    }

    private static String tickSurround(String result) {
        if (!result.startsWith("`")) {
            result = "`" + result;
        }
        if (!result.endsWith("`")) {
            result = result + "`";
        }
        return result;
    }
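
    // e.g. tickSurround("uuid") returns "`uuid`"; an already backtick-quoted name passes through unchanged.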

    /**
     * Create a 'MAP' Hive schema from a Parquet map field
     *
     * @param keyType   Hive type of the map key
     * @param valueType Hive type of the map value
     * @return Equivalent 'MAP' Hive schema
     */
    private static String createHiveMap(String keyType, String valueType) {
        return "MAP< " + keyType + ", " + valueType + ">";
    }

    /**
     * Create an 'ARRAY' Hive schema from the equivalent parquet list type
     *
     * @param elementType Parquet type of the list elements
     * @param elementName name of the enclosing list field
     * @return Equivalent 'ARRAY' Hive schema
     */
    private static String createHiveArray(Type elementType, String elementName) {
        StringBuilder array = new StringBuilder();
        array.append("ARRAY< ");
        if (elementType.isPrimitive()) {
            array.append(convertField(elementType));
        } else {
            final GroupType groupType = elementType.asGroupType();
            final List<Type> groupFields = groupType.getFields();
            if (groupFields.size() > 1 || (groupFields.size() == 1 && (elementType.getName().equals("array")
                    || elementType.getName().equals(elementName + "_tuple")))) {
                array.append(convertField(elementType));
            } else {
                array.append(convertField(groupType.getFields().get(0)));
            }
        }
        array.append(">");
        return array.toString();
    }
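
    // Illustrative example (not from the original source): a repeated int32 element yields
    // "ARRAY< int>", while a repeated group with several fields yields a nested STRUCT element.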

    public static boolean isSchemaTypeUpdateAllowed(String prevType, String newType) {
        if (prevType == null || prevType.trim().isEmpty() || newType == null || newType.trim().isEmpty()) {
            return false;
        }
        prevType = prevType.toLowerCase();
        newType = newType.toLowerCase();
        if (prevType.equals(newType)) {
            return true;
        } else if (prevType.equals("int") && newType.equals("bigint")) {
            return true;
        } else if (prevType.equals("float") && newType.equals("double")) {
            return true;
        } else if (prevType.contains("struct") && newType.contains("struct")) {
            return true;
        }
        return false;
    }
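
    // Illustrative examples (not from the original source):
    //   isSchemaTypeUpdateAllowed("int", "bigint")   -> true  (widening promotion)
    //   isSchemaTypeUpdateAllowed("float", "double") -> true  (widening promotion)
    //   isSchemaTypeUpdateAllowed("bigint", "int")   -> false (narrowing is rejected)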

    public static String generateSchemaString(MessageType storageSchema) throws IOException {
        Map<String, String> hiveSchema = convertParquetSchemaToHiveSchema(storageSchema);
        StringBuilder columns = new StringBuilder();
        for (Map.Entry<String, String> hiveSchemaEntry : hiveSchema.entrySet()) {
            columns.append(hiveSchemaEntry.getKey()).append(" ");
            columns.append(hiveSchemaEntry.getValue()).append(", ");
        }
        // Remove the last ", "
        columns.delete(columns.length() - 2, columns.length());
        return columns.toString();
    }
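
    // Illustrative example (not from the original source): for columns {`id` -> "int",
    // `name` -> "string"} this returns "`id` int, `name` string".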

    public static String generateCreateDDL(MessageType storageSchema, HoodieDatasetReference metadata,
            String[] partitionKeys, String inputFormatClass, String outputFormatClass) throws IOException {
        Map<String, String> hiveSchema = convertParquetSchemaToHiveSchema(storageSchema);
        String columns = generateSchemaString(storageSchema);

        StringBuilder partitionFields = new StringBuilder();
        for (String partitionKey : partitionKeys) {
            if (partitionFields.length() > 0) {
                // Separate successive partition columns with a comma
                partitionFields.append(", ");
            }
            partitionFields.append(partitionKey).append(" ").append(getPartitionKeyType(hiveSchema, partitionKey));
        }

        StringBuilder sb = new StringBuilder("CREATE EXTERNAL TABLE IF NOT EXISTS ");
        sb = sb.append(metadata.getDatabaseTableName());
        sb = sb.append("( ").append(columns).append(")");
        if (partitionKeys.length > 0) {
            sb = sb.append(" PARTITIONED BY (").append(partitionFields).append(")");
        }
        sb = sb.append(" ROW FORMAT SERDE '").append(ParquetHiveSerDe.class.getName()).append("'");
        sb = sb.append(" STORED AS INPUTFORMAT '").append(inputFormatClass).append("'");
        sb = sb.append(" OUTPUTFORMAT '").append(outputFormatClass).append("' LOCATION '")
                .append(metadata.getBaseDatasetPath()).append("'");
        return sb.toString();
    }
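
    // Illustrative output shape (hypothetical names and locations, not from the original source):
    //   CREATE EXTERNAL TABLE IF NOT EXISTS db.trips( `id` int, `name` string)
    //   PARTITIONED BY (datestr String) ROW FORMAT SERDE '...' STORED AS INPUTFORMAT '...'
    //   OUTPUTFORMAT '...' LOCATION '...'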

    private static String getPartitionKeyType(Map<String, String> hiveSchema, String partitionKey) {
        if (hiveSchema.containsKey(partitionKey)) {
            return hiveSchema.get(partitionKey);
        }
        // Default unknown partition fields to String
        // TODO - all partition fields should be part of the schema. datestr is treated as special. Don't do that
        return "String";
    }
}
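
Example Usage

A minimal, hypothetical sketch of how this class can be exercised; it is not part of the original file. It assumes SchemaUtil and parquet-mr (which provides parquet.schema.MessageTypeParser) are on the classpath; the schema, field, and class names below are made up for illustration.

import com.uber.hoodie.hive.client.SchemaUtil;

import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

import java.io.IOException;
import java.util.Map;

public class SchemaUtilExample {
    public static void main(String[] args) throws IOException {
        // Parse a small Parquet schema from its textual representation
        MessageType storageSchema = MessageTypeParser.parseMessageType(
                "message trip_record {"
                        + " required int64 trip_id;"
                        + " optional binary city (UTF8);"
                        + " optional double fare;"
                        + "}");

        // Convert to Hive columns: {`trip_id` -> bigint, `city` -> string, `fare` -> double}
        Map<String, String> hiveSchema = SchemaUtil.convertParquetSchemaToHiveSchema(storageSchema);
        System.out.println(hiveSchema);

        // Render the columns as they would appear in a CREATE TABLE statement
        System.out.println(SchemaUtil.generateSchemaString(storageSchema));

        // Check which type changes the evolution rules permit
        System.out.println(SchemaUtil.isSchemaTypeUpdateAllowed("int", "bigint")); // true
        System.out.println(SchemaUtil.isSchemaTypeUpdateAllowed("bigint", "int")); // false
    }
}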