com.thinkbiganalytics.nifi.v2.ingest.MergeTable.java Source code

Introduction

Here is the source code for com.thinkbiganalytics.nifi.v2.ingest.MergeTable.java, a NiFi processor from the thinkbig-nifi-core-processors module that merges or fully synchronizes values from a Hive feed partition into a target table.

Source

package com.thinkbiganalytics.nifi.v2.ingest;

/*-
 * #%L
 * thinkbig-nifi-core-processors
 * %%
 * Copyright (C) 2017 ThinkBig Analytics
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.thinkbiganalytics.ingest.TableMergeSyncSupport;
import com.thinkbiganalytics.nifi.processor.AbstractNiFiProcessor;
import com.thinkbiganalytics.nifi.v2.thrift.ThriftService;
import com.thinkbiganalytics.util.ColumnSpec;
import com.thinkbiganalytics.util.PartitionSpec;

import org.apache.commons.lang3.StringUtils;
import org.apache.nifi.annotation.behavior.EventDriven;
import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.util.StopWatch;

import java.sql.Connection;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.FEED_PARTITION;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.FIELD_SPECIFICATION;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.PARTITION_SPECIFICATION;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.REL_FAILURE;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.REL_SUCCESS;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.SOURCE_SCHEMA;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.SOURCE_TABLE;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.TARGET_SCHEMA;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.TARGET_TABLE;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.THRIFT_SERVICE;

@EventDriven
@InputRequirement(InputRequirement.Requirement.INPUT_ALLOWED)
@Tags({ "hive", "ddl", "merge", "sync", "thinkbig" })
@CapabilityDescription("Fully synchronize or Merge values from a feed partition into the target table optionally supporting de-dupe and overwriting partitions. Sync will overwrite the entire table "
        + "to match the source.")
public class MergeTable extends AbstractNiFiProcessor {

    /**
     * Merge using primary key
     **/
    public static final String STRATEGY_PK_MERGE = "PK_MERGE";

    /**
     * Merge with dedupe
     **/
    public static final String STRATEGY_DEDUPE_MERGE = "DEDUPE_AND_MERGE";
    /**
     * Merge allowing duplicates
     **/
    public static final String STRATEGY_MERGE = "MERGE";

    /**
     * Sync replaces everything in the table
     **/
    public static final String STRATEGY_SYNC = "SYNC";

    /**
     * Rolling sync: same as SYNC but at the partition level, overwriting only partitions present in the source.
     **/
    public static final String STRATEGY_ROLLING_SYNC = "ROLLING_SYNC";
    public static final PropertyDescriptor MERGE_STRATEGY = new PropertyDescriptor.Builder().name("Merge Strategy")
            .description(
                    "Specifies the algorithm used to merge. Valid values are SYNC, MERGE, PK_MERGE, DEDUPE_AND_MERGE, and ROLLING_SYNC. Sync will completely overwrite the target table with the source data. "
                            + "Rolling Sync will overwrite target partitions only when present in the source. "
                            + "Merge will append the data into the target partitions. "
                            + "Dedupe will insert into the target partition but ensure no duplicate rows remain. "
                            + "PK Merge will insert new rows or update existing rows matching the same primary key.")
            .required(true).expressionLanguageSupported(true)
            .allowableValues(STRATEGY_MERGE, STRATEGY_DEDUPE_MERGE, STRATEGY_PK_MERGE, STRATEGY_SYNC,
                    STRATEGY_ROLLING_SYNC, "${metadata.table.targetMergeStrategy}")
            .defaultValue("${metadata.table.targetMergeStrategy}").build();
    public static final PropertyDescriptor HIVE_CONFIGURATIONS = new PropertyDescriptor.Builder()
            .name("Hive Configurations")
            .description("Pipe separated list of Hive Configurations that you would like to set for Hive queries ")
            .required(false).addValidator(StandardValidators.NON_EMPTY_VALIDATOR).expressionLanguageSupported(true)
            .build();
    private final Set<Relationship> relationships;
    private final List<PropertyDescriptor> propDescriptors;

    public MergeTable() {
        final Set<Relationship> r = new HashSet<>();
        r.add(REL_SUCCESS);
        r.add(REL_FAILURE);
        relationships = Collections.unmodifiableSet(r);

        final List<PropertyDescriptor> pds = new ArrayList<>();
        pds.add(THRIFT_SERVICE);
        pds.add(MERGE_STRATEGY);
        pds.add(SOURCE_SCHEMA);
        pds.add(SOURCE_TABLE);
        pds.add(TARGET_SCHEMA);
        pds.add(TARGET_TABLE);
        pds.add(FEED_PARTITION);
        pds.add(PARTITION_SPECIFICATION);
        pds.add(FIELD_SPECIFICATION);
        pds.add(HIVE_CONFIGURATIONS);

        propDescriptors = Collections.unmodifiableList(pds);
    }

    @Override
    public Set<Relationship> getRelationships() {
        return relationships;
    }

    @Override
    protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        return propDescriptors;
    }

    @Override
    public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
        final ComponentLog logger = getLog();
        FlowFile flowFile = session.get();
        if (flowFile == null) {
            return;
        }

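        // Name of the flow file attribute used to record this processor's execution status for provenance.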
        String PROVENANCE_EXECUTION_STATUS_KEY = context.getName() + " Execution Status";

        ThriftService thriftService = context.getProperty(THRIFT_SERVICE).asControllerService(ThriftService.class);
        String partitionSpecString = context.getProperty(PARTITION_SPECIFICATION)
                .evaluateAttributeExpressions(flowFile).getValue();
        String sourceSchema = context.getProperty(SOURCE_SCHEMA).evaluateAttributeExpressions(flowFile).getValue();
        String sourceTable = context.getProperty(SOURCE_TABLE).evaluateAttributeExpressions(flowFile).getValue();
        String targetSchema = context.getProperty(TARGET_SCHEMA).evaluateAttributeExpressions(flowFile).getValue();
        String targetTable = context.getProperty(TARGET_TABLE).evaluateAttributeExpressions(flowFile).getValue();
        String feedPartitionValue = context.getProperty(FEED_PARTITION).evaluateAttributeExpressions(flowFile)
                .getValue();
        String mergeStrategyValue = context.getProperty(MERGE_STRATEGY).evaluateAttributeExpressions(flowFile)
                .getValue();
        String hiveConfigurations = context.getProperty(HIVE_CONFIGURATIONS).evaluateAttributeExpressions(flowFile)
                .getValue();
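        // Parse the optional field specification into column specs; only PK_MERGE strictly requires it (checked below).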
        final ColumnSpec[] columnSpecs = Optional
                .ofNullable(
                        context.getProperty(FIELD_SPECIFICATION).evaluateAttributeExpressions(flowFile).getValue())
                .filter(StringUtils::isNotEmpty).map(ColumnSpec::createFromString).orElse(new ColumnSpec[0]);

        if (STRATEGY_PK_MERGE.equals(mergeStrategyValue) && (columnSpecs == null || columnSpecs.length == 0)) {
            getLog().error("Missing required field specification for PK merge feature");
            flowFile = session.putAttribute(flowFile, PROVENANCE_EXECUTION_STATUS_KEY,
                    "Failed: Missing required field specification for PK merge feature");
            session.transfer(flowFile, IngestProperties.REL_FAILURE);
            return;
        }

        // Maintain default for backward compatibility
        if (StringUtils.isEmpty(mergeStrategyValue)) {
            mergeStrategyValue = STRATEGY_DEDUPE_MERGE;
        }

        logger.info("Merge strategy: " + mergeStrategyValue + " Using Source: " + sourceTable + " Target: "
                + targetTable + " feed partition:" + feedPartitionValue + " partSpec: " + partitionSpecString);

        final StopWatch stopWatch = new StopWatch(true);

        try (final Connection conn = thriftService.getConnection()) {

            TableMergeSyncSupport mergeSupport = new TableMergeSyncSupport(conn);
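            // Enable Hive dynamic partitioning so the merge/sync can write to whichever target partitions the source data requires.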
            mergeSupport.enableDynamicPartitions();

            if (StringUtils.isNotEmpty(hiveConfigurations)) {
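                // Each pipe-separated entry is applied to the Hive session before the merge runs,
                // e.g. a hypothetical value "hive.exec.max.dynamic.partitions=1000|hive.merge.mapfiles=true".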
                mergeSupport.setHiveConf(hiveConfigurations.split("\\|"));
            }

            PartitionSpec partitionSpec = new PartitionSpec(partitionSpecString);

            if (STRATEGY_DEDUPE_MERGE.equals(mergeStrategyValue)) {
                mergeSupport.doMerge(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec,
                        feedPartitionValue, true);
            } else if (STRATEGY_MERGE.equals(mergeStrategyValue)) {
                mergeSupport.doMerge(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec,
                        feedPartitionValue, false);
            } else if (STRATEGY_SYNC.equals(mergeStrategyValue)) {
                mergeSupport.doSync(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec,
                        feedPartitionValue);
            } else if (STRATEGY_ROLLING_SYNC.equals(mergeStrategyValue)) {
                mergeSupport.doRollingSync(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec,
                        feedPartitionValue);
            } else if (STRATEGY_PK_MERGE.equals(mergeStrategyValue)) {
                mergeSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec,
                        feedPartitionValue, columnSpecs);
            } else {
                throw new UnsupportedOperationException("Failed to resolve the merge strategy");
            }

            stopWatch.stop();
            session.getProvenanceReporter().modifyContent(flowFile, "Execution completed",
                    stopWatch.getElapsed(TimeUnit.MILLISECONDS));
            flowFile = session.putAttribute(flowFile, PROVENANCE_EXECUTION_STATUS_KEY, "Successful");
            session.transfer(flowFile, REL_SUCCESS);

        } catch (final Exception e) {
            logger.error("Unable to execute merge doMerge for {} due to {}; routing to failure",
                    new Object[] { flowFile, e }, e);
            flowFile = session.putAttribute(flowFile, PROVENANCE_EXECUTION_STATUS_KEY, "Failed: " + e.getMessage());
            session.transfer(flowFile, REL_FAILURE);
        }
    }
}
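
Example

The following is a minimal sketch, not part of the original file, of how this processor might be exercised with NiFi's mock test framework (the nifi-mock artifact). The schema, table, and partition values are hypothetical, and a real or mock ThriftService controller service would still have to be registered and enabled before runner.run() could perform an actual merge against Hive.

package com.thinkbiganalytics.nifi.v2.ingest;

import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;

public class MergeTableExample {

    public static void main(String[] args) {
        // Build a mock flow around the processor (requires the nifi-mock dependency on the classpath).
        final TestRunner runner = TestRunners.newTestRunner(MergeTable.class);

        // Hypothetical configuration values; FEED_PARTITION is typically the processing timestamp of the feed run.
        runner.setProperty(MergeTable.MERGE_STRATEGY, MergeTable.STRATEGY_DEDUPE_MERGE);
        runner.setProperty(IngestProperties.SOURCE_SCHEMA, "my_feed_valid");
        runner.setProperty(IngestProperties.SOURCE_TABLE, "users_valid");
        runner.setProperty(IngestProperties.TARGET_SCHEMA, "warehouse");
        runner.setProperty(IngestProperties.TARGET_TABLE, "users");
        runner.setProperty(IngestProperties.FEED_PARTITION, "1496706735");

        // A ThriftService implementation must be wired in before the processor can run, for example:
        //   runner.addControllerService("thrift", myThriftService);
        //   runner.enableControllerService(myThriftService);
        //   runner.setProperty(IngestProperties.THRIFT_SERVICE, "thrift");
        //   runner.enqueue(new byte[0]);
        //   runner.run();
        //   runner.assertAllFlowFilesTransferred(IngestProperties.REL_SUCCESS, 1);
    }
}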