org.apache.gobblin.compliance.purger.HivePurgerPublisher.java Source code

Introduction

Here is the source code for org.apache.gobblin.compliance.purger.HivePurgerPublisher.java. This publisher marks SUCCESSFUL purge WorkUnits as COMMITTED and all others as FAILED, and submits a Gobblin compliance event with purge metrics for each WorkUnit.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.gobblin.compliance.purger;

import java.security.PrivilegedExceptionAction;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.thrift.TException;

import com.google.common.base.Splitter;

import lombok.extern.slf4j.Slf4j;

import org.apache.gobblin.compliance.ComplianceConfigurationKeys;
import org.apache.gobblin.compliance.ComplianceEvents;
import org.apache.gobblin.compliance.HivePartitionDataset;
import org.apache.gobblin.compliance.utils.DatasetUtils;
import org.apache.gobblin.configuration.ConfigurationKeys;
import org.apache.gobblin.configuration.State;
import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.instrumented.Instrumented;
import org.apache.gobblin.metrics.MetricContext;
import org.apache.gobblin.metrics.event.EventSubmitter;
import org.apache.gobblin.publisher.DataPublisher;
import org.apache.gobblin.source.workunit.WorkUnit;
import org.apache.gobblin.util.HostUtils;

/**
 * A publisher that moves SUCCESSFUL WorkUnitStates to COMMITTED and all other
 * WorkUnitStates to FAILED, submitting a compliance event for each WorkUnit.
 *
 * @author adsharma
 */
@Slf4j
public class HivePurgerPublisher extends DataPublisher {
    protected MetricContext metricContext;
    protected EventSubmitter eventSubmitter;
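    // Metastore client used to resolve the purged partition when building event metadata.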
    public HiveMetaStoreClient client;

    public HivePurgerPublisher(State state) throws Exception {
        super(state);
        this.metricContext = Instrumented.getMetricContext(state, this.getClass());
        this.eventSubmitter = new EventSubmitter.Builder(this.metricContext, ComplianceEvents.NAMESPACE).build();

        initHiveMetastoreClient();
    }

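    /**
     * Creates the {@link HiveMetaStoreClient}, first performing a Kerberos keytab login
     * as the configured superuser when a keytab location is present in the state.
     */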
    public void initHiveMetastoreClient() throws Exception {
        if (this.state.contains(ConfigurationKeys.SUPER_USER_KEY_TAB_LOCATION)) {
            String superUser = this.state.getProp(ComplianceConfigurationKeys.GOBBLIN_COMPLIANCE_SUPER_USER);
            String realm = this.state.getProp(ConfigurationKeys.KERBEROS_REALM);
            String keytabLocation = this.state.getProp(ConfigurationKeys.SUPER_USER_KEY_TAB_LOCATION);
            log.info("Establishing MetastoreClient connection using " + keytabLocation);

            UserGroupInformation.loginUserFromKeytab(HostUtils.getPrincipalUsingHostname(superUser, realm),
                    keytabLocation);
            UserGroupInformation loginUser = UserGroupInformation.getLoginUser();
            loginUser.doAs(new PrivilegedExceptionAction<Void>() {
                @Override
                public Void run() throws TException {
                    HivePurgerPublisher.this.client = new HiveMetaStoreClient(new HiveConf());
                    return null;
                }
            });
        } else {
            this.client = new HiveMetaStoreClient(new HiveConf());
        }
    }

    @Override
    public void initialize() {
    }

    @Override
    public void publishData(Collection<? extends WorkUnitState> states) {
        for (WorkUnitState state : states) {
            if (state.getWorkingState() == WorkUnitState.WorkingState.SUCCESSFUL) {
                state.setWorkingState(WorkUnitState.WorkingState.COMMITTED);
                submitEvent(state, ComplianceEvents.Purger.WORKUNIT_COMMITTED);
            } else {
                state.setWorkingState(WorkUnitState.WorkingState.FAILED);
                submitEvent(state, ComplianceEvents.Purger.WORKUNIT_FAILED);
            }
        }
    }

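    /**
     * Builds event metadata (records and bytes read and written, records purged) for the
     * partition referenced by this WorkUnit, then submits it under the given event name.
     */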
    private void submitEvent(WorkUnitState state, String name) {
        WorkUnit workUnit = state.getWorkunit();
        Map<String, String> metadata = new HashMap<>();
        String recordsRead = state.getProp(ComplianceConfigurationKeys.NUM_ROWS);
        metadata.put(ComplianceConfigurationKeys.WORKUNIT_RECORDSREAD, recordsRead);
        metadata.put(ComplianceConfigurationKeys.WORKUNIT_BYTESREAD,
                getDataSize(workUnit.getProp(ComplianceConfigurationKeys.RAW_DATA_SIZE),
                        workUnit.getProp(ComplianceConfigurationKeys.TOTAL_SIZE)));

        String partitionNameProp = workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME);
        // A partition is identified as dbName@tableName@partitionName.
        Splitter atSplitter = Splitter.on("@").omitEmptyStrings().trimResults();
        List<String> namesList = atSplitter.splitToList(partitionNameProp);
        if (namesList.size() != 3) {
            log.warn("Not submitting event. Invalid partition name: " + partitionNameProp);
            return;
        }

        String dbName = namesList.get(0);
        String tableName = namesList.get(1);
        String partitionName = namesList.get(2);
        org.apache.hadoop.hive.metastore.api.Partition apiPartition = null;
        Partition qlPartition = null;
        try {
            Table table = new Table(this.client.getTable(dbName, tableName));
            apiPartition = this.client.getPartition(dbName, tableName, partitionName);
            qlPartition = new Partition(table, apiPartition);
        } catch (Exception e) {
            log.warn("Not submitting event. Failed to resolve partition '" + partitionName + "': " + e);
            e.printStackTrace();
            return;
        }

        HivePartitionDataset hivePartitionDataset = new HivePartitionDataset(qlPartition);

        String recordsWritten = DatasetUtils.getProperty(hivePartitionDataset, ComplianceConfigurationKeys.NUM_ROWS,
                ComplianceConfigurationKeys.DEFAULT_NUM_ROWS);

        String recordsPurged = Long.toString(Long.parseLong(recordsRead) - Long.parseLong(recordsWritten));
        metadata.put(ComplianceConfigurationKeys.WORKUNIT_RECORDSWRITTEN, recordsWritten);
        metadata.put(ComplianceConfigurationKeys.WORKUNIT_BYTESWRITTEN,
                getDataSize(
                        DatasetUtils.getProperty(hivePartitionDataset, ComplianceConfigurationKeys.RAW_DATA_SIZE,
                                ComplianceConfigurationKeys.DEFAULT_RAW_DATA_SIZE),
                        DatasetUtils.getProperty(hivePartitionDataset, ComplianceConfigurationKeys.TOTAL_SIZE,
                                ComplianceConfigurationKeys.DEFAULT_TOTAL_SIZE)));

        metadata.put(DatasetMetrics.DATABASE_NAME, hivePartitionDataset.getDbName());
        metadata.put(DatasetMetrics.TABLE_NAME, hivePartitionDataset.getTableName());
        metadata.put(DatasetMetrics.PARTITION_NAME, hivePartitionDataset.getName());
        metadata.put(DatasetMetrics.RECORDS_PURGED, recordsPurged);

        this.eventSubmitter.submit(name, metadata);
    }

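    /**
     * Returns totalDataSize when it is positive; otherwise falls back to rawDataSize.
     */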
    private String getDataSize(String rawDataSize, String totalDataSize) {
        long rawDataSizeVal = Long.parseLong(rawDataSize);
        long totalDataSizeVal = Long.parseLong(totalDataSize);
        long dataSize = totalDataSizeVal;
        if (totalDataSizeVal <= 0) {
            dataSize = rawDataSizeVal;
        }
        return Long.toString(dataSize);
    }

    @Override
    public void publishMetadata(Collection<? extends WorkUnitState> states) {
    }

    @Override
    public void close() {
    }

    public static class DatasetMetrics {
        public static final String DATABASE_NAME = "HiveDatabaseName";
        public static final String TABLE_NAME = "HiveTableName";
        public static final String PARTITION_NAME = "HivePartitionName";
        public static final String RECORDS_PURGED = "RecordsPurged";
    }
}
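
For reference, submitEvent expects the WorkUnit's PARTITION_NAME property to have the form dbName@tableName@partitionName. The following standalone sketch reproduces just that parsing step with the same Guava Splitter the publisher uses; the sample property value is hypothetical and stands in for what a real purger WorkUnit would carry.

import java.util.List;

import com.google.common.base.Splitter;

public class PartitionNameParseExample {
    public static void main(String[] args) {
        // Hypothetical value; in the publisher this comes from
        // workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME).
        String partitionNameProp = "tracking@events@datepartition=2017-01-01";

        Splitter atSplitter = Splitter.on("@").omitEmptyStrings().trimResults();
        List<String> names = atSplitter.splitToList(partitionNameProp);

        // The publisher drops the event unless exactly three components are present.
        if (names.size() != 3) {
            System.out.println("Invalid partition name: " + partitionNameProp);
            return;
        }
        System.out.println("db=" + names.get(0) + ", table=" + names.get(1)
                + ", partition=" + names.get(2));
    }
}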