gobblin.data.management.conversion.hive.publisher.HiveConvertPublisher.java Source code

Introduction

Here is the source code for gobblin.data.management.conversion.hive.publisher.HiveConvertPublisher.java, the Gobblin DataPublisher that registers the results of Hive Avro-to-ORC conversion: it moves publish directories into place, executes the publish DDL, advances watermarks, preserves selected partition parameters, and cleans up staging tables and directories.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package gobblin.data.management.conversion.hive.publisher;

import java.io.IOException;
import java.net.URI;
import java.sql.SQLException;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import javax.annotation.Nonnull;

import lombok.extern.slf4j.Slf4j;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.thrift.TException;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Optional;
import com.google.common.base.Predicate;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Ordering;
import com.google.common.collect.Sets;

import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.State;
import gobblin.configuration.WorkUnitState;
import gobblin.configuration.WorkUnitState.WorkingState;
import gobblin.data.management.conversion.hive.avro.AvroSchemaManager;
import gobblin.data.management.conversion.hive.entities.QueryBasedHivePublishEntity;
import gobblin.data.management.conversion.hive.events.EventConstants;
import gobblin.data.management.conversion.hive.events.EventWorkunitUtils;
import gobblin.data.management.conversion.hive.query.HiveAvroORCQueryGenerator;
import gobblin.data.management.conversion.hive.source.HiveSource;
import gobblin.data.management.conversion.hive.source.HiveWorkUnit;
import gobblin.data.management.conversion.hive.watermarker.HiveSourceWatermarker;
import gobblin.data.management.conversion.hive.watermarker.HiveSourceWatermarkerFactory;
import gobblin.data.management.conversion.hive.watermarker.PartitionLevelWatermarker;
import gobblin.data.management.copy.hive.HiveDatasetFinder;
import gobblin.hive.HiveMetastoreClientPool;
import gobblin.util.AutoReturnableObject;
import gobblin.util.HiveJdbcConnector;
import gobblin.instrumented.Instrumented;
import gobblin.metrics.MetricContext;
import gobblin.metrics.event.EventSubmitter;
import gobblin.metrics.event.sla.SlaEventSubmitter;
import gobblin.publisher.DataPublisher;
import gobblin.util.HadoopUtils;
import gobblin.util.WriterUtils;
import gobblin.util.reflection.GobblinConstructorUtils;

/**
 * A simple {@link DataPublisher} that executes the publish and cleanup commands generated during Hive
 * conversion and updates the watermark and working state of each {@link WorkUnitState}.
 */
@Slf4j
public class HiveConvertPublisher extends DataPublisher {

    private final AvroSchemaManager avroSchemaManager;
    private final HiveJdbcConnector hiveJdbcConnector;
    private MetricContext metricContext;
    private EventSubmitter eventSubmitter;
    private final FileSystem fs;
    private final HiveSourceWatermarker watermarker;
    private final HiveMetastoreClientPool pool;

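    /**
     * Config keys that control which source partition parameters are copied onto the destination partition
     * after a successful publish. Both values are comma-separated name patterns where '*' acts as a wildcard,
     * and the blacklist takes precedence over the whitelist. A hypothetical example:
     * hive.conversion.partitionParameters.whitelist=totalSize,numRows
     */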
    public static final String PARTITION_PARAMETERS_WHITELIST = "hive.conversion.partitionParameters.whitelist";
    public static final String PARTITION_PARAMETERS_BLACKLIST = "hive.conversion.partitionParameters.blacklist";
    public static final String COMPLETE_SOURCE_PARTITION_NAME = "completeSourcePartitionName";
    public static final String COMPLETE_DEST_PARTITION_NAME = "completeDestPartitionName";

    private static final Splitter COMMA_SPLITTER = Splitter.on(",").omitEmptyStrings().trimResults();
    private static final Splitter At_SPLITTER = Splitter.on("@").omitEmptyStrings().trimResults();

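    /**
     * Builds the publisher: an {@link AvroSchemaManager}, a metric context and event submitter, the target
     * {@link FileSystem} (taken from {@link ConfigurationKeys#WRITER_FILE_SYSTEM_URI} if set), a
     * {@link HiveJdbcConnector} for executing publish DDL, the configured {@link HiveSourceWatermarker},
     * and a {@link HiveMetastoreClientPool} for partition-parameter operations.
     */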
    public HiveConvertPublisher(State state) throws IOException {
        super(state);
        this.avroSchemaManager = new AvroSchemaManager(FileSystem.get(HadoopUtils.newConfiguration()), state);
        this.metricContext = Instrumented.getMetricContext(state, HiveConvertPublisher.class);
        this.eventSubmitter = new EventSubmitter.Builder(this.metricContext, EventConstants.CONVERSION_NAMESPACE)
                .build();

        Configuration conf = new Configuration();
        Optional<String> uri = Optional.fromNullable(this.state.getProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI));
        if (uri.isPresent()) {
            this.fs = FileSystem.get(URI.create(uri.get()), conf);
        } else {
            this.fs = FileSystem.get(conf);
        }

        try {
            this.hiveJdbcConnector = HiveJdbcConnector.newConnectorWithProps(state.getProperties());
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }

        this.watermarker = GobblinConstructorUtils.invokeConstructor(HiveSourceWatermarkerFactory.class,
                state.getProp(HiveSource.HIVE_SOURCE_WATERMARKER_FACTORY_CLASS_KEY,
                        HiveSource.DEFAULT_HIVE_SOURCE_WATERMARKER_FACTORY_CLASS))
                .createFromState(state);
        this.pool = HiveMetastoreClientPool.get(state.getProperties(),
                Optional.fromNullable(state.getProperties().getProperty(HiveDatasetFinder.HIVE_METASTORE_URI_KEY)));
    }

    @Override
    public void initialize() throws IOException {
    }

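    /**
     * Publishes all work units of the job in one call. If any work unit is unsuccessful, every work unit is
     * marked {@link WorkingState#FAILED} and only cleanup commands are collected. Otherwise publish
     * directories are moved into place, publish DDL is executed, each work unit is marked
     * {@link WorkingState#COMMITTED}, its watermark is advanced, and a success SLA event is emitted for
     * non-watermark work units. Partition parameters are preserved and cleanup queries and staging
     * directories are processed in the finally block, regardless of the outcome.
     */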
    @Override
    public void publishData(Collection<? extends WorkUnitState> states) throws IOException {

        Set<String> cleanUpQueries = Sets.newLinkedHashSet();
        Set<String> publishQueries = Sets.newLinkedHashSet();
        List<String> directoriesToDelete = Lists.newArrayList();

        try {
            if (Iterables.tryFind(states, UNSUCCESSFUL_WORKUNIT).isPresent()) {
                /////////////////////////////////////////
                // Prepare cleanup and ignore publish
                /////////////////////////////////////////
                for (WorkUnitState wus : states) {
                    QueryBasedHivePublishEntity publishEntity = HiveAvroORCQueryGenerator
                            .deserializePublishCommands(wus);

                    // Add cleanup commands - to be executed later
                    if (publishEntity.getCleanupQueries() != null) {
                        cleanUpQueries.addAll(publishEntity.getCleanupQueries());
                    }

                    if (publishEntity.getCleanupDirectories() != null) {
                        directoriesToDelete.addAll(publishEntity.getCleanupDirectories());
                    }

                    EventWorkunitUtils.setBeginPublishDDLExecuteTimeMetadata(wus, System.currentTimeMillis());
                    wus.setWorkingState(WorkingState.FAILED);
                    if (!wus.getPropAsBoolean(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY)) {
                        try {
                            new SlaEventSubmitter(eventSubmitter, EventConstants.CONVERSION_FAILED_EVENT,
                                    wus.getProperties()).submit();
                        } catch (Exception e) {
                            log.error("Failed while emitting SLA event, but ignoring and moving forward to curate "
                                    + "all clean up comamnds", e);
                        }
                    }
                }
            } else {
                /////////////////////////////////////////
                // Prepare publish and cleanup commands
                /////////////////////////////////////////
                for (WorkUnitState wus : PARTITION_PUBLISH_ORDERING.sortedCopy(states)) {
                    QueryBasedHivePublishEntity publishEntity = HiveAvroORCQueryGenerator
                            .deserializePublishCommands(wus);

                    // Add cleanup commands - to be executed later
                    if (publishEntity.getCleanupQueries() != null) {
                        cleanUpQueries.addAll(publishEntity.getCleanupQueries());
                    }

                    if (publishEntity.getCleanupDirectories() != null) {
                        directoriesToDelete.addAll(publishEntity.getCleanupDirectories());
                    }

                    if (publishEntity.getPublishDirectories() != null) {
                        // Publish snapshot / partition directories
                        Map<String, String> publishDirectories = publishEntity.getPublishDirectories();
                        for (Map.Entry<String, String> publishDir : publishDirectories.entrySet()) {
                            moveDirectory(publishDir.getKey(), publishDir.getValue());
                        }
                    }

                    if (publishEntity.getPublishQueries() != null) {
                        publishQueries.addAll(publishEntity.getPublishQueries());
                    }
                }

                /////////////////////////////////////////
                // Core publish
                /////////////////////////////////////////

                // Update publish start timestamp on all workunits
                for (WorkUnitState wus : PARTITION_PUBLISH_ORDERING.sortedCopy(states)) {
                    if (HiveAvroORCQueryGenerator.deserializePublishCommands(wus).getPublishQueries() != null) {
                        EventWorkunitUtils.setBeginPublishDDLExecuteTimeMetadata(wus, System.currentTimeMillis());
                    }
                }

                // Actual publish: Register snapshot / partition
                executeQueries(Lists.newArrayList(publishQueries));

                // Update publish completion timestamp on all workunits
                for (WorkUnitState wus : PARTITION_PUBLISH_ORDERING.sortedCopy(states)) {
                    if (HiveAvroORCQueryGenerator.deserializePublishCommands(wus).getPublishQueries() != null) {
                        EventWorkunitUtils.setEndPublishDDLExecuteTimeMetadata(wus, System.currentTimeMillis());
                    }

                    wus.setWorkingState(WorkingState.COMMITTED);
                    this.watermarker.setActualHighWatermark(wus);

                    // Emit an SLA event for conversion successful
                    if (!wus.getPropAsBoolean(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY)) {
                        EventWorkunitUtils.setIsFirstPublishMetadata(wus);
                        try {
                            new SlaEventSubmitter(eventSubmitter, EventConstants.CONVERSION_SUCCESSFUL_SLA_EVENT,
                                    wus.getProperties()).submit();
                        } catch (Exception e) {
                            log.error("Failed while emitting SLA event, but ignoring and moving forward to curate "
                                    + "all clean up commands", e);
                        }
                    }
                }
            }
        } finally {
            /////////////////////////////////////////
            // Preserving partition params
            /////////////////////////////////////////
            preservePartitionParams(states);

            /////////////////////////////////////////
            // Post publish cleanup
            /////////////////////////////////////////

            // Execute cleanup commands
            try {
                executeQueries(Lists.newArrayList(cleanUpQueries));
            } catch (Exception e) {
                log.error("Failed to cleanup staging entities in Hive metastore.", e);
            }
            try {
                deleteDirectories(directoriesToDelete);
            } catch (Exception e) {
                log.error("Failed to cleanup staging directories.", e);
            }
        }
    }

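    /**
     * For every {@link WorkingState#COMMITTED} work unit that carries complete source and destination
     * partition names and a whitelist or blacklist, copies the matching partition parameters from the
     * source partition to the destination partition in the Hive metastore.
     */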
    @VisibleForTesting
    public void preservePartitionParams(Collection<? extends WorkUnitState> states) {
        for (WorkUnitState wus : states) {
            if (wus.getWorkingState() != WorkingState.COMMITTED) {
                continue;
            }
            if (!wus.contains(COMPLETE_SOURCE_PARTITION_NAME)) {
                continue;
            }
            if (!wus.contains(COMPLETE_DEST_PARTITION_NAME)) {
                continue;
            }
            if (!(wus.contains(PARTITION_PARAMETERS_WHITELIST) || wus.contains(PARTITION_PARAMETERS_BLACKLIST))) {
                continue;
            }
            List<String> whitelist = COMMA_SPLITTER
                    .splitToList(wus.getProp(PARTITION_PARAMETERS_WHITELIST, StringUtils.EMPTY));
            List<String> blacklist = COMMA_SPLITTER
                    .splitToList(wus.getProp(PARTITION_PARAMETERS_BLACKLIST, StringUtils.EMPTY));
            String completeSourcePartitionName = wus.getProp(COMPLETE_SOURCE_PARTITION_NAME);
            String completeDestPartitionName = wus.getProp(COMPLETE_DEST_PARTITION_NAME);
            if (!copyPartitionParams(completeSourcePartitionName, completeDestPartitionName, whitelist,
                    blacklist)) {
                log.warn("Unable to copy partition parameters from " + completeSourcePartitionName + " to "
                        + completeDestPartitionName);
            }
        }
    }

    /**
     * Copies partition parameters from the source partition to the destination partition, subject to the
     * whitelist and blacklist, by dropping and re-adding the destination partition with the merged parameters.
     * @param completeSourcePartitionName dbName@tableName@partitionName of the source partition
     * @param completeDestPartitionName dbName@tableName@partitionName of the destination partition
     * @param whitelist partition parameter name patterns to copy ('*' acts as a wildcard)
     * @param blacklist partition parameter name patterns to skip; takes precedence over the whitelist
     * @return true if the parameters were copied successfully, false otherwise
     */
    @VisibleForTesting
    public boolean copyPartitionParams(String completeSourcePartitionName, String completeDestPartitionName,
            List<String> whitelist, List<String> blacklist) {
        Optional<Partition> sourcePartitionOptional = getPartitionObject(completeSourcePartitionName);
        Optional<Partition> destPartitionOptional = getPartitionObject(completeDestPartitionName);
        if ((!sourcePartitionOptional.isPresent()) || (!destPartitionOptional.isPresent())) {
            return false;
        }
        Map<String, String> sourceParams = sourcePartitionOptional.get().getParameters();
        Map<String, String> destParams = destPartitionOptional.get().getParameters();

        for (Map.Entry<String, String> param : sourceParams.entrySet()) {
            if (!matched(whitelist, blacklist, param.getKey())) {
                continue;
            }
            destParams.put(param.getKey(), param.getValue());
        }
        destPartitionOptional.get().setParameters(destParams);
        if (!dropPartition(completeDestPartitionName)) {
            return false;
        }
        if (!addPartition(destPartitionOptional.get(), completeDestPartitionName)) {
            return false;
        }
        return true;
    }

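    /**
     * Drops the partition identified by dbName@tableName@partitionName without deleting its data.
     * Returns false if the name is malformed or the metastore call fails.
     */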
    @VisibleForTesting
    public boolean dropPartition(String completePartitionName) {
        List<String> partitionList = At_SPLITTER.splitToList(completePartitionName);
        if (partitionList.size() != 3) {
            log.warn("Invalid partition name " + completePartitionName);
            return false;
        }
        try (AutoReturnableObject<IMetaStoreClient> client = pool.getClient()) {
            client.get().dropPartition(partitionList.get(0), partitionList.get(1), partitionList.get(2), false);
            return true;
        } catch (IOException | TException e) {
            log.warn("Unable to drop Partition " + completePartitionName);
        }
        return false;
    }

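    /**
     * Registers the given partition with the Hive metastore. Returns false if the call fails.
     */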
    @VisibleForTesting
    public boolean addPartition(Partition destPartition, String completePartitionName) {
        try (AutoReturnableObject<IMetaStoreClient> client = pool.getClient()) {
            client.get().add_partition(destPartition);
            return true;
        } catch (IOException | TException e) {
            log.warn("Unable to add Partition " + completePartitionName);
        }
        return false;
    }

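    /**
     * Fetches the {@link Partition} identified by dbName@tableName@partitionName from the metastore,
     * or {@link Optional#absent()} if the name is malformed or the lookup fails.
     */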
    @VisibleForTesting
    public Optional<Partition> getPartitionObject(String completePartitionName) {
        try (AutoReturnableObject<IMetaStoreClient> client = pool.getClient()) {
            List<String> partitionList = At_SPLITTER.splitToList(completePartitionName);
            if (partitionList.size() != 3) {
                log.warn("Invalid partition name " + completePartitionName);
                return Optional.<Partition>absent();
            }
            Partition sourcePartition = client.get().getPartition(partitionList.get(0), partitionList.get(1),
                    partitionList.get(2));
            return Optional.fromNullable(sourcePartition);
        } catch (IOException | TException e) {
            log.warn("Unable to get partition object from metastore for partition " + completePartitionName);
        }
        return Optional.<Partition>absent();
    }

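    /**
     * Returns true only if the key matches a whitelist pattern and no blacklist pattern;
     * the blacklist takes precedence over the whitelist.
     */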
    @VisibleForTesting
    private boolean matched(List<String> whitelist, List<String> blacklist, String key) {
        for (String patternStr : blacklist) {
            if (Pattern.matches(getRegexPatternString(patternStr), key)) {
                return false;
            }
        }

        for (String patternStr : whitelist) {
            if (Pattern.matches(getRegexPatternString(patternStr), key)) {
                return true;
            }
        }
        return false;
    }

    @VisibleForTesting
    private String getRegexPatternString(String patternStr) {
        // Treat '*' in the configured pattern as a wildcard and anchor the resulting
        // regex with word boundaries before it is used in Pattern.matches().
        patternStr = patternStr.replace("*", ".*");
        StringBuilder builder = new StringBuilder();
        builder.append("\\b").append(patternStr).append("\\b");
        return builder.toString();
    }

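    /**
     * Moves sourceDir to targetDir, deleting any pre-existing targetDir and creating the parent directories
     * of targetDir first. Throws an {@link IOException} if the rename fails.
     */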
    private void moveDirectory(String sourceDir, String targetDir) throws IOException {
        // If targetDir exists, delete it
        if (this.fs.exists(new Path(targetDir))) {
            deleteDirectory(targetDir);
        }

        // Create parent directories of targetDir
        WriterUtils.mkdirsWithRecursivePermission(this.fs, new Path(targetDir).getParent(),
                FsPermission.getCachePoolDefault());

        // Move directory
        log.info("Moving directory: " + sourceDir + " to: " + targetDir);
        if (!this.fs.rename(new Path(sourceDir), new Path(targetDir))) {
            throw new IOException(String.format("Unable to move %s to %s", sourceDir, targetDir));
        }
    }

    private void deleteDirectories(List<String> directoriesToDelete) throws IOException {
        for (String directory : directoriesToDelete) {
            deleteDirectory(directory);
        }
    }

    private void deleteDirectory(String dirToDelete) throws IOException {
        if (StringUtils.isBlank(dirToDelete)) {
            return;
        }

        log.info("Going to delete existing partition data: " + dirToDelete);
        this.fs.delete(new Path(dirToDelete), true);
    }

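    /**
     * Executes the given DDL statements over the Hive JDBC connection; no-op for a null or empty list.
     */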
    private void executeQueries(List<String> queries) {
        if (null == queries || queries.isEmpty()) {
            return;
        }
        try {
            this.hiveJdbcConnector.executeStatements(queries.toArray(new String[queries.size()]));
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public void publishMetadata(Collection<? extends WorkUnitState> states) throws IOException {
    }

    @Override
    public void close() throws IOException {
        this.avroSchemaManager.cleanupTempSchemas();
        this.hiveJdbcConnector.close();
    }

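    /**
     * A work unit is considered unsuccessful if it is null or its working state is not
     * {@link WorkingState#SUCCESSFUL}.
     */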
    private static final Predicate<WorkUnitState> UNSUCCESSFUL_WORKUNIT = new Predicate<WorkUnitState>() {

        @Override
        public boolean apply(WorkUnitState input) {
            return null == input || !WorkingState.SUCCESSFUL.equals(input.getWorkingState());
        }
    };

    /**
     * Publish workunits in lexicographic order of partition names.
     * If a workunit is a no-op workunit, {@link HiveWorkUnit#getPartitionName()} will be absent. This can
     * happen when using {@link PartitionLevelWatermarker}, which creates a dummy workunit for all
     * watermarks. It is safe to always publish this dummy workunit at the end because we do not want to
     * update the actual high watermark until all other partitions have been successfully published; hence
     * the nullsLast ordering.
     */
    private static final Ordering<WorkUnitState> PARTITION_PUBLISH_ORDERING = Ordering.natural().nullsLast()
            .onResultOf(new Function<WorkUnitState, String>() {
                public String apply(@Nonnull WorkUnitState wus) {
                    return new HiveWorkUnit(wus.getWorkunit()).getPartitionName().orNull();
                }
            });
}
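
Example

The whitelist/blacklist filtering used by preservePartitionParams can be tried out on its own. The sketch
below is not part of the Gobblin codebase; the class name and the sample parameter names are made up for
illustration. It mimics matched() and getRegexPatternString(): '*' is expanded to '.*', the pattern is
anchored with word boundaries, and the blacklist wins over the whitelist.

import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

public class PartitionParamFilterDemo {

    // Mirrors getRegexPatternString(): '*' becomes '.*' and the pattern is wrapped in word boundaries.
    static String toRegex(String patternStr) {
        return "\\b" + patternStr.replace("*", ".*") + "\\b";
    }

    // Mirrors matched(): blacklisted keys are never copied; whitelisted keys are.
    static boolean matched(List<String> whitelist, List<String> blacklist, String key) {
        for (String patternStr : blacklist) {
            if (Pattern.matches(toRegex(patternStr), key)) {
                return false;
            }
        }
        for (String patternStr : whitelist) {
            if (Pattern.matches(toRegex(patternStr), key)) {
                return true;
            }
        }
        return false;
    }

    public static void main(String[] args) {
        List<String> whitelist = Arrays.asList("total*", "numRows");
        List<String> blacklist = Arrays.asList("transient_lastDdlTime");
        for (String key : Arrays.asList("totalSize", "numRows", "numFiles", "transient_lastDdlTime")) {
            // Prints: totalSize -> true, numRows -> true, numFiles -> false, transient_lastDdlTime -> false
            System.out.println(key + " -> " + matched(whitelist, blacklist, key));
        }
    }
}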