com.uber.hoodie.hive.HiveSyncTool.java Source code


Introduction

Here is the source code for com.uber.hoodie.hive.HiveSyncTool.java.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.hive;

import com.beust.jcommander.JCommander;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.exception.InvalidDatasetException;
import com.uber.hoodie.hadoop.HoodieInputFormat;
import com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat;
import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent;
import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent.PartitionEventType;
import com.uber.hoodie.hive.util.SchemaUtil;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import parquet.schema.MessageType;

/**
 * Tool to sync a hoodie HDFS dataset with a hive metastore table. Use it either as an API,
 * HiveSyncTool.syncHoodieTable(HiveSyncConfig), or from the command line:
 * java -cp hoodie-hive.jar HiveSyncTool [args]
 * <p>
 * This utility reads the schema from the latest commit and syncs it to the hive table schema.
 * It also syncs the partitions incrementally (all the partitions modified since the last
 * synced commit).
 */
@SuppressWarnings("WeakerAccess")
public class HiveSyncTool {

    private static final Logger LOG = LoggerFactory.getLogger(HiveSyncTool.class);
    private final HoodieHiveClient hoodieHiveClient;
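    // Suffix appended to the table name when registering the realtime table of a
    // MERGE_ON_READ dataset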
    public static final String SUFFIX_REALTIME_TABLE = "_rt";
    private final HiveSyncConfig cfg;

    public HiveSyncTool(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) {
        this.hoodieHiveClient = new HoodieHiveClient(cfg, configuration, fs);
        this.cfg = cfg;
    }

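    /**
     * Syncs the hoodie dataset to hive. A COPY_ON_WRITE dataset is synced as a single table;
     * a MERGE_ON_READ dataset is synced twice: once as a read-optimized table and once as a
     * realtime table (with the "_rt" suffix).
     */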
    public void syncHoodieTable() {
        switch (hoodieHiveClient.getTableType()) {
        case COPY_ON_WRITE:
            syncHoodieTable(false);
            break;
        case MERGE_ON_READ:
            // sync an RO (read-optimized) table for MOR
            syncHoodieTable(false);
            String originalTableName = cfg.tableName;
            //TODO : Make realtime table registration optional using a config param
            cfg.tableName = cfg.tableName + SUFFIX_REALTIME_TABLE;
            // sync an RT (realtime) table for MOR
            syncHoodieTable(true);
            cfg.tableName = originalTableName;
            break;
        default:
            LOG.error("Unknown table type " + hoodieHiveClient.getTableType());
            throw new InvalidDatasetException(hoodieHiveClient.getBasePath());
        }
        hoodieHiveClient.close();
    }

    private void syncHoodieTable(boolean isRealTime) {
        LOG.info("Trying to sync hoodie table " + cfg.tableName + " with base path "
                + hoodieHiveClient.getBasePath() + " of type " + hoodieHiveClient.getTableType());

        // Check if the necessary table exists
        boolean tableExists = hoodieHiveClient.doesTableExist();
        // Get the parquet schema for this dataset looking at the latest commit
        MessageType schema = hoodieHiveClient.getDataSchema();
        // Sync schema if needed
        syncSchema(tableExists, isRealTime, schema);

        LOG.info("Schema sync complete. Syncing partitions for " + cfg.tableName);
        // Get the last time we successfully synced partitions
        Optional<String> lastCommitTimeSynced = Optional.empty();
        if (tableExists) {
            lastCommitTimeSynced = hoodieHiveClient.getLastCommitTimeSynced();
        }
        LOG.info("Last commit time synced was found to be " + lastCommitTimeSynced.orElse("null"));
        List<String> writtenPartitionsSince = hoodieHiveClient.getPartitionsWrittenToSince(lastCommitTimeSynced);
        LOG.info("Storage partitions scan complete. Found " + writtenPartitionsSince.size());
        // Sync the partitions if needed
        syncPartitions(writtenPartitionsSince);

        hoodieHiveClient.updateLastCommitTimeSynced();
        LOG.info("Sync complete for " + cfg.tableName);
    }

    /**
     * Gets the latest schema from the last commit and checks if it is in sync with the hive
     * table schema. If not, evolves the table schema.
     *
     * @param tableExists - does the table exist
     * @param isRealTime  - whether the table being synced is the realtime table
     * @param schema      - extracted schema
     */
    private void syncSchema(boolean tableExists, boolean isRealTime, MessageType schema) {
        // Check and sync schema
        if (!tableExists) {
            LOG.info("Table " + cfg.tableName + " is not found. Creating it");
            if (!isRealTime) {
                // TODO - RO Table for MOR only after major compaction (UnboundedCompaction is default
                // for now)
                hoodieHiveClient.createTable(schema, HoodieInputFormat.class.getName(),
                        MapredParquetOutputFormat.class.getName(), ParquetHiveSerDe.class.getName());
            } else {
                // Custom serde will not work with ALTER TABLE REPLACE COLUMNS
                // https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive
                // /ql/exec/DDLTask.java#L3488
                hoodieHiveClient.createTable(schema, HoodieRealtimeInputFormat.class.getName(),
                        MapredParquetOutputFormat.class.getName(), ParquetHiveSerDe.class.getName());
            }
        } else {
            // Check if the dataset schema has evolved
            Map<String, String> tableSchema = hoodieHiveClient.getTableSchema();
            SchemaDifference schemaDiff = SchemaUtil.getSchemaDifference(schema, tableSchema, cfg.partitionFields);
            if (!schemaDiff.isEmpty()) {
                LOG.info("Schema difference found for " + cfg.tableName);
                hoodieHiveClient.updateTableDefinition(schema);
            } else {
                LOG.info("No Schema difference for " + cfg.tableName);
            }
        }
    }

    /**
     * Syncs the list of storage partitions passed in (checks if each partition exists in hive;
     * if not, it is added, and if the partition path does not match, the path is updated).
     */
    private void syncPartitions(List<String> writtenPartitionsSince) {
        try {
            List<Partition> hivePartitions = hoodieHiveClient.scanTablePartitions();
            List<PartitionEvent> partitionEvents = hoodieHiveClient.getPartitionEvents(hivePartitions,
                    writtenPartitionsSince);
            List<String> newPartitions = filterPartitions(partitionEvents, PartitionEventType.ADD);
            LOG.info("New Partitions " + newPartitions);
            hoodieHiveClient.addPartitionsToTable(newPartitions);
            List<String> updatePartitions = filterPartitions(partitionEvents, PartitionEventType.UPDATE);
            LOG.info("Changed Partitions " + updatePartitions);
            hoodieHiveClient.updatePartitionsToTable(updatePartitions);
        } catch (Exception e) {
            throw new HoodieHiveSyncException("Failed to sync partitions for table " + cfg.tableName, e);
        }
    }

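    /**
     * Returns the storage partitions of the given events that match the passed event type.
     */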
    private List<String> filterPartitions(List<PartitionEvent> events, PartitionEventType eventType) {
        return events.stream().filter(s -> s.eventType == eventType).map(s -> s.storagePartition)
                .collect(Collectors.toList());
    }

    public static void main(String[] args) throws Exception {
        // parse the params
        final HiveSyncConfig cfg = new HiveSyncConfig();
        JCommander cmd = new JCommander(cfg, args);
        if (cfg.help || args.length == 0) {
            cmd.usage();
            System.exit(1);
        }
        FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration());
        HiveConf hiveConf = new HiveConf();
        hiveConf.addResource(fs.getConf());
        new HiveSyncTool(cfg, hiveConf, fs).syncHoodieTable();
    }
}
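
Usage

For reference, here is a minimal sketch of how the tool can be wired up programmatically, mirroring the main() method above. The table name, base path, and partition field values are placeholders; partitionFields is assumed to be a List<String>, and the jdbcUrl, hiveUser, and hivePass fields are assumed to exist on HiveSyncConfig (they do not appear in this listing):

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;

import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.hive.HiveSyncConfig;
import com.uber.hoodie.hive.HiveSyncTool;

public class HiveSyncExample {

    public static void main(String[] args) throws Exception {
        HiveSyncConfig cfg = new HiveSyncConfig();
        // tableName, basePath and partitionFields are referenced by HiveSyncTool above;
        // the values here are placeholders for illustration.
        cfg.tableName = "hoodie_trips";
        cfg.basePath = "hdfs:///data/hoodie/trips";
        cfg.partitionFields = Arrays.asList("datestr");
        // Assumed HiveSyncConfig fields (not visible in the listing above); adjust
        // them to match your metastore setup.
        cfg.jdbcUrl = "jdbc:hive2://localhost:10000";
        cfg.hiveUser = "hive";
        cfg.hivePass = "";

        // Same wiring as HiveSyncTool.main(...)
        FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration());
        HiveConf hiveConf = new HiveConf();
        hiveConf.addResource(fs.getConf());
        new HiveSyncTool(cfg, hiveConf, fs).syncHoodieTable();
    }
}

Alternatively, as the class comment notes, the tool can be run from the command line with java -cp hoodie-hive.jar com.uber.hoodie.hive.HiveSyncTool [args]; the supported arguments are defined by the JCommander annotations on HiveSyncConfig.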