gobblin.hive.metastore.HiveMetaStoreUtils.java Source code

Introduction

Here is the source code for gobblin.hive.metastore.HiveMetaStoreUtils.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.hive.metastore;

import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.avro.SchemaParseException;
import org.apache.commons.lang.reflect.MethodUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.TableType;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.avro.AvroSerDe;
import org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils;
import org.apache.hadoop.util.ReflectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.primitives.Ints;

import gobblin.annotation.Alpha;
import gobblin.configuration.State;
import gobblin.hive.HiveConstants;
import gobblin.hive.HivePartition;
import gobblin.hive.HiveRegistrationUnit;
import gobblin.hive.HiveRegistrationUnit.Column;
import gobblin.hive.HiveTable;

/**
 * A utility class for converting Hive's {@link Table} and {@link Partition} objects into Gobblin's
 * {@link HiveTable} and {@link HivePartition} objects, and vice versa.
 *
 * @author Ziyang Liu
 */
@Alpha
public class HiveMetaStoreUtils {

    private static final Logger LOG = LoggerFactory.getLogger(HiveMetaStoreUtils.class);

    private static final TableType DEFAULT_TABLE_TYPE = TableType.EXTERNAL_TABLE;
    private static final Splitter LIST_SPLITTER_COMMA = Splitter.on(",").trimResults().omitEmptyStrings();
    private static final Splitter LIST_SPLITTER_COLON = Splitter.on(":").trimResults().omitEmptyStrings();
    private static final String EXTERNAL = "EXTERNAL";
    public static final String RUNTIME_PROPS = "runtime.props";

    private HiveMetaStoreUtils() {
    }

    /**
     * Convert a {@link HiveTable} into a {@link Table}.
     */
    public static Table getTable(HiveTable hiveTable) {
        State props = hiveTable.getProps();
        Table table = new Table();
        table.setDbName(hiveTable.getDbName());
        table.setTableName(hiveTable.getTableName());
        table.setParameters(getParameters(props));
        if (hiveTable.getCreateTime().isPresent()) {
            table.setCreateTime(Ints.checkedCast(hiveTable.getCreateTime().get()));
        }
        if (hiveTable.getLastAccessTime().isPresent()) {
            table.setLastAccessTime(Ints.checkedCast(hiveTable.getLastAccessTime().get()));
        }
        if (hiveTable.getOwner().isPresent()) {
            table.setOwner(hiveTable.getOwner().get());
        }
        if (hiveTable.getRetention().isPresent()) {
            table.setRetention(Ints.checkedCast(hiveTable.getRetention().get()));
        }
        if (hiveTable.getTableType().isPresent()) {
            table.setTableType(hiveTable.getTableType().get());
        } else {
            table.setTableType(DEFAULT_TABLE_TYPE.toString());
        }
        if (table.getTableType().equals(TableType.EXTERNAL_TABLE.toString())) {
            table.getParameters().put(EXTERNAL, Boolean.TRUE.toString().toUpperCase());
        }
        table.setPartitionKeys(getFieldSchemas(hiveTable.getPartitionKeys()));
        table.setSd(getStorageDescriptor(hiveTable));
        return table;
    }
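
    // For illustration only: a minimal usage sketch of the conversion above, assuming that
    // HiveTable.Builder (used the same way in getHiveTable() below) supplies sensible defaults
    // for the omitted optional fields. The db and table names are hypothetical.
    //
    //   HiveTable hiveTable = new HiveTable.Builder()
    //       .withDbName("test_db")
    //       .withTableName("test_table")
    //       .build();
    //   Table table = HiveMetaStoreUtils.getTable(hiveTable);
    //
    // With no table type set, the result defaults to EXTERNAL_TABLE and its parameters gain
    // EXTERNAL=TRUE, per the logic above.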

    /**
     * Convert a {@link Table} into a {@link HiveTable}.
     */
    public static HiveTable getHiveTable(Table table) {
        State tableProps = getTableProps(table);
        State storageProps = getStorageProps(table.getSd());
        State serDeProps = getSerDeProps(table.getSd().getSerdeInfo());
        HiveTable hiveTable = new HiveTable.Builder().withDbName(table.getDbName())
                .withTableName(table.getTableName()).withPartitionKeys(getColumns(table.getPartitionKeys()))
                .withProps(tableProps).withStorageProps(storageProps).withSerdeProps(serDeProps).build();
        if (table.getCreateTime() > 0) {
            hiveTable.setCreateTime(table.getCreateTime());
        }
        if (table.getSd().getCols() != null) {
            hiveTable.setColumns(getColumns(table.getSd().getCols()));
        }
        if (table.getSd().getBucketCols() != null) {
            hiveTable.setBucketColumns(table.getSd().getBucketCols());
        }
        return hiveTable;
    }
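
    // For illustration: the two conversions are intended to round-trip ("and vice versa" in the
    // class Javadoc). Building on the sketch after getTable() above:
    //
    //   Table table = HiveMetaStoreUtils.getTable(hiveTable);
    //   HiveTable roundTripped = HiveMetaStoreUtils.getHiveTable(table);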

    /**
     * Convert a {@link HivePartition} into a {@link Partition}.
     */
    public static Partition getPartition(HivePartition hivePartition) {
        State props = hivePartition.getProps();
        Partition partition = new Partition();
        partition.setDbName(hivePartition.getDbName());
        partition.setTableName(hivePartition.getTableName());
        partition.setValues(hivePartition.getValues());
        partition.setParameters(getParameters(props));
        if (hivePartition.getCreateTime().isPresent()) {
            partition.setCreateTime(Ints.checkedCast(hivePartition.getCreateTime().get()));
        } else if (props.contains(HiveConstants.CREATE_TIME)) {
            partition.setCreateTime(props.getPropAsInt(HiveConstants.CREATE_TIME));
        }
        if (props.contains(HiveConstants.LAST_ACCESS_TIME)) {
            partition.setLastAccessTime(props.getPropAsInt(HiveConstants.LAST_ACCESS_TIME));
        }
        partition.setSd(getStorageDescriptor(hivePartition));
        return partition;
    }
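
    // For illustration: a minimal sketch for partitions, assuming HivePartition.Builder accepts
    // the same chain used in getHivePartition() below. The names and values are hypothetical.
    //
    //   HivePartition hivePartition = new HivePartition.Builder()
    //       .withDbName("test_db")
    //       .withTableName("test_table")
    //       .withPartitionValues(Lists.newArrayList("2016-01-01"))
    //       .build();
    //   Partition partition = HiveMetaStoreUtils.getPartition(hivePartition);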

    /**
     * Convert a {@link Partition} into a {@link HivePartition}.
     */
    public static HivePartition getHivePartition(Partition partition) {
        State partitionProps = getPartitionProps(partition);
        State storageProps = getStorageProps(partition.getSd());
        State serDeProps = getSerDeProps(partition.getSd().getSerdeInfo());
        HivePartition hivePartition = new HivePartition.Builder().withDbName(partition.getDbName())
                .withTableName(partition.getTableName()).withPartitionValues(partition.getValues())
                .withProps(partitionProps).withStorageProps(storageProps).withSerdeProps(serDeProps).build();
        if (partition.getCreateTime() > 0) {
            hivePartition.setCreateTime(partition.getCreateTime());
        }
        if (partition.getSd().getCols() != null) {
            hivePartition.setColumns(getColumns(partition.getSd().getCols()));
        }
        if (partition.getSd().getBucketCols() != null) {
            hivePartition.setBucketColumns(partition.getSd().getBucketCols());
        }
        return hivePartition;
    }

    private static Map<String, String> getParameters(State props) {
        Map<String, String> parameters = Maps.newHashMap();
        if (props.contains(RUNTIME_PROPS)) {
            String runtimePropsString = props.getProp(RUNTIME_PROPS);
            for (String propValue : LIST_SPLITTER_COMMA.splitToList(runtimePropsString)) {
                List<String> tokens = LIST_SPLITTER_COLON.splitToList(propValue);
                Preconditions.checkState(tokens.size() == 2,
                        propValue + " is not a valid Hive table/partition property");
                parameters.put(tokens.get(0), tokens.get(1));
            }
        }
        for (String propKey : props.getPropertyNames()) {
            if (!propKey.equals(RUNTIME_PROPS)) {
                parameters.put(propKey, props.getProp(propKey));
            }
        }
        return parameters;
    }
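
    // For illustration: RUNTIME_PROPS is expected to hold comma-separated key:value pairs
    // (the keys and values below are hypothetical):
    //
    //   State props = new State();
    //   props.setProp(HiveMetaStoreUtils.RUNTIME_PROPS, "numFiles:12,totalSize:2048");
    //   props.setProp("owner", "gobblin");
    //
    // This yields the parameters {numFiles=12, totalSize=2048, owner=gobblin}; the
    // runtime.props entry itself is excluded by the loop above.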

    private static StorageDescriptor getStorageDescriptor(HiveRegistrationUnit unit) {
        State props = unit.getStorageProps();
        StorageDescriptor sd = new StorageDescriptor();
        sd.setParameters(getParameters(props));
        sd.setCols(getFieldSchemas(unit));
        if (unit.getLocation().isPresent()) {
            sd.setLocation(unit.getLocation().get());
        }
        if (unit.getInputFormat().isPresent()) {
            sd.setInputFormat(unit.getInputFormat().get());
        }
        if (unit.getOutputFormat().isPresent()) {
            sd.setOutputFormat(unit.getOutputFormat().get());
        }
        if (unit.getIsCompressed().isPresent()) {
            sd.setCompressed(unit.getIsCompressed().get());
        }
        if (unit.getNumBuckets().isPresent()) {
            sd.setNumBuckets(unit.getNumBuckets().get());
        }
        if (unit.getBucketColumns().isPresent()) {
            sd.setBucketCols(unit.getBucketColumns().get());
        }
        if (unit.getIsStoredAsSubDirs().isPresent()) {
            sd.setStoredAsSubDirectories(unit.getIsStoredAsSubDirs().get());
        }
        sd.setSerdeInfo(getSerDeInfo(unit));
        return sd;
    }

    private static SerDeInfo getSerDeInfo(HiveRegistrationUnit unit) {
        State props = unit.getSerDeProps();
        SerDeInfo si = new SerDeInfo();
        si.setParameters(getParameters(props));
        si.setName(unit.getTableName());
        if (unit.getSerDeType().isPresent()) {
            si.setSerializationLib(unit.getSerDeType().get());
        }
        return si;
    }

    private static State getTableProps(Table table) {
        State tableProps = new State();
        for (Map.Entry<String, String> entry : table.getParameters().entrySet()) {
            tableProps.setProp(entry.getKey(), entry.getValue());
        }
        if (table.isSetCreateTime()) {
            tableProps.setProp(HiveConstants.CREATE_TIME, table.getCreateTime());
        }
        if (table.isSetLastAccessTime()) {
            tableProps.setProp(HiveConstants.LAST_ACCESS_TIME, table.getLastAccessTime());
        }
        if (table.isSetOwner()) {
            tableProps.setProp(HiveConstants.OWNER, table.getOwner());
        }
        if (table.isSetTableType()) {
            tableProps.setProp(HiveConstants.TABLE_TYPE, table.getTableType());
        }
        if (table.isSetRetention()) {
            tableProps.setProp(HiveConstants.RETENTION, table.getRetention());
        }
        return tableProps;
    }

    private static State getPartitionProps(Partition partition) {
        State partitionProps = new State();
        for (Map.Entry<String, String> entry : partition.getParameters().entrySet()) {
            partitionProps.setProp(entry.getKey(), entry.getValue());
        }
        if (partition.isSetCreateTime()) {
            partitionProps.setProp(HiveConstants.CREATE_TIME, partition.getCreateTime());
        }
        if (partition.isSetLastAccessTime()) {
            partitionProps.setProp(HiveConstants.LAST_ACCESS_TIME, partition.getLastAccessTime());
        }
        return partitionProps;
    }

    private static State getStorageProps(StorageDescriptor sd) {
        State storageProps = new State();
        for (Map.Entry<String, String> entry : sd.getParameters().entrySet()) {
            storageProps.setProp(entry.getKey(), entry.getValue());
        }
        if (sd.isSetLocation()) {
            storageProps.setProp(HiveConstants.LOCATION, sd.getLocation());
        }
        if (sd.isSetInputFormat()) {
            storageProps.setProp(HiveConstants.INPUT_FORMAT, sd.getInputFormat());
        }
        if (sd.isSetOutputFormat()) {
            storageProps.setProp(HiveConstants.OUTPUT_FORMAT, sd.getOutputFormat());
        }
        if (sd.isSetCompressed()) {
            storageProps.setProp(HiveConstants.COMPRESSED, sd.isCompressed());
        }
        if (sd.isSetNumBuckets()) {
            storageProps.setProp(HiveConstants.NUM_BUCKETS, sd.getNumBuckets());
        }
        if (sd.isSetBucketCols()) {
            for (String bucketColumn : sd.getBucketCols()) {
                storageProps.appendToListProp(HiveConstants.BUCKET_COLUMNS, bucketColumn);
            }
        }
        if (sd.isSetStoredAsSubDirectories()) {
            storageProps.setProp(HiveConstants.STORED_AS_SUB_DIRS, sd.isStoredAsSubDirectories());
        }
        return storageProps;
    }

    private static State getSerDeProps(SerDeInfo si) {
        State serDeProps = new State();
        for (Map.Entry<String, String> entry : si.getParameters().entrySet()) {
            serDeProps.setProp(entry.getKey(), entry.getValue());
        }
        if (si.isSetSerializationLib()) {
            serDeProps.setProp(HiveConstants.SERDE_TYPE, si.getSerializationLib());
        }
        return serDeProps;
    }

    private static List<Column> getColumns(List<FieldSchema> fieldSchemas) {
        List<Column> columns = Lists.newArrayListWithCapacity(fieldSchemas.size());
        for (FieldSchema fieldSchema : fieldSchemas) {
            columns.add(new Column(fieldSchema.getName(), fieldSchema.getType(), fieldSchema.getComment()));
        }
        return columns;
    }

    private static List<FieldSchema> getFieldSchemas(List<Column> columns) {
        List<FieldSchema> fieldSchemas = Lists.newArrayListWithCapacity(columns.size());
        for (Column column : columns) {
            fieldSchemas.add(new FieldSchema(column.getName(), column.getType(), column.getComment()));
        }
        return fieldSchemas;
    }

    /**
     * First tries to get the {@code FieldSchema}s from the {@code HiveRegistrationUnit}'s columns, if set.
     * Otherwise, gets the {@code FieldSchema}s from the deserializer.
     */
    private static List<FieldSchema> getFieldSchemas(HiveRegistrationUnit unit) {
        List<Column> columns = unit.getColumns();
        List<FieldSchema> fieldSchemas = new ArrayList<>();
        if (columns != null && !columns.isEmpty()) {
            fieldSchemas = getFieldSchemas(columns);
        } else {
            Deserializer deserializer = getDeserializer(unit);
            if (deserializer != null) {
                try {
                    fieldSchemas = MetaStoreUtils.getFieldsFromDeserializer(unit.getTableName(), deserializer);
                } catch (SerDeException | MetaException e) {
                    LOG.warn("Encountered exception while getting fields from deserializer.", e);
                }
            }
        }
        return fieldSchemas;
    }
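
    // For illustration: a unit registered with explicit columns short-circuits the deserializer
    // path entirely; schema inference through the SerDe only happens when no columns were set,
    // for example for Avro tables whose schema is carried in serde properties instead.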

    /**
     * Returns a {@link Deserializer} for the {@link HiveRegistrationUnit}'s SerDe type, if one is
     * specified and can be successfully initialized. Otherwise returns null.
     */
    private static Deserializer getDeserializer(HiveRegistrationUnit unit) {
        Optional<String> serdeClass = unit.getSerDeType();
        if (!serdeClass.isPresent()) {
            return null;
        }

        String serde = serdeClass.get();
        HiveConf hiveConf = new HiveConf();

        Deserializer deserializer;
        try {
            deserializer = ReflectionUtils
                    .newInstance(hiveConf.getClassByName(serde).asSubclass(Deserializer.class), hiveConf);
        } catch (ClassNotFoundException e) {
            LOG.warn("Serde class " + serde + " not found!", e);
            return null;
        }

        Properties props = new Properties();
        props.putAll(unit.getProps().getProperties());
        props.putAll(unit.getStorageProps().getProperties());
        props.putAll(unit.getSerDeProps().getProperties());

        try {
            SerDeUtils.initializeSerDe(deserializer, hiveConf, props, null);

            // Temporary check that's needed until Gobblin is upgraded to Hive 1.1.0+, which includes the improved error
            // handling in AvroSerDe added in HIVE-7868.
            if (deserializer instanceof AvroSerDe) {
                try {
                    invokeDetermineSchemaOrThrowExceptionMethod(props, new Configuration());
                } catch (SchemaParseException | InvocationTargetException | NoSuchMethodException
                        | IllegalAccessException e) {
                    LOG.warn("Failed to initialize AvroSerDe.");
                    throw new SerDeException(e);
                }
            }
        } catch (SerDeException e) {
            LOG.warn("Failed to initialize serde " + serde + " with properties " + props + " for table "
                    + unit.getDbName() + "." + unit.getTableName());
            return null;
        }

        return deserializer;
    }
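
    // Note the property merge order above: serde properties overwrite storage properties, which
    // in turn overwrite the unit's general properties, because each successive putAll() call
    // replaces earlier entries under the same key.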

    @VisibleForTesting
    protected static void invokeDetermineSchemaOrThrowExceptionMethod(Properties props, Configuration conf)
            throws NoSuchMethodException, IllegalAccessException, InvocationTargetException {
        String methodName = "determineSchemaOrThrowException";
        Method method = MethodUtils.getAccessibleMethod(AvroSerdeUtils.class, methodName, Properties.class);
        boolean withConf = false;
        if (method == null) {
            method = MethodUtils.getAccessibleMethod(AvroSerdeUtils.class, methodName,
                    new Class[] { Configuration.class, Properties.class });
            withConf = true;
        }
        Preconditions.checkNotNull(method, "Cannot find matching " + methodName);
        if (!withConf) {
            MethodUtils.invokeStaticMethod(AvroSerdeUtils.class, methodName, props);
        } else {
            MethodUtils.invokeStaticMethod(AvroSerdeUtils.class, methodName, new Object[] { conf, props });
        }
    }
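
    // For illustration: the reflective lookup above bridges a signature change in
    // AvroSerdeUtils.determineSchemaOrThrowException between Hive versions, probing the
    // single-argument form determineSchemaOrThrowException(Properties) first and falling back
    // to the two-argument form (Configuration, Properties), so the same Gobblin build can run
    // against either version of Hive.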
}