com.thinkbiganalytics.nifi.v2.ingest.CreateElasticsearchBackedHiveTable.java Source code

Introduction

Here is the source code for com.thinkbiganalytics.nifi.v2.ingest.CreateElasticsearchBackedHiveTable.java

Source

package com.thinkbiganalytics.nifi.v2.ingest;

/*-
 * #%L
 * thinkbig-nifi-core-processors
 * %%
 * Copyright (C) 2017 ThinkBig Analytics
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.thinkbiganalytics.nifi.v2.thrift.ExecuteHQLStatement;
import com.thinkbiganalytics.nifi.v2.thrift.ThriftService;
import com.thinkbiganalytics.util.ColumnSpec;
import com.thinkbiganalytics.util.TableType;

import org.apache.commons.lang3.StringUtils;
import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;

import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;

import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.FEED_CATEGORY;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.FEED_NAME;
import static com.thinkbiganalytics.nifi.v2.ingest.IngestProperties.FIELD_SPECIFICATION;

/**
 * This processor creates an external Hive table whose data is backed by an Elasticsearch index
 */
@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED)
@Tags({ "elasticsearch", "thinkbig", "hive" })
@CapabilityDescription("Creates a table in Hive that is backed by data in Elasticsearch")
public class CreateElasticsearchBackedHiveTable extends ExecuteHQLStatement {

    /**
     * List of Elasticsearch nodes to connect to
     */
    public static final PropertyDescriptor NODES = new PropertyDescriptor.Builder().name("Nodes")
            .description("A comma separated list of one or more Elasticsearch nodes to be used.").required(true)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR).expressionLanguageSupported(true).build();

    public static final PropertyDescriptor FEED_ROOT = new PropertyDescriptor.Builder().name("Feed Root Path")
            .description("Specify the full HDFS or S3 root path for the index table.").required(true)
            .defaultValue("${s3ingest.s3.protocol}://${s3ingest.hiveBucket}/${hive.profile.root}")
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR).expressionLanguageSupported(true).build();

    /**
     * Location of Jar file
     */
    public static final PropertyDescriptor JAR_URL = new PropertyDescriptor.Builder().name("Jar URL")
            .description("Location of Jar file to be added before table creation").required(false)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR).expressionLanguageSupported(true).build();

    /**
     * Property for the field to use as the document ID when indexing into Elasticsearch.
     */
    public static final PropertyDescriptor ID_FIELD = new PropertyDescriptor.Builder().name("IdField").description(
            "Field whose value is used as the document ID when indexing into Elasticsearch. If it is empty, Elasticsearch will generate its own IDs.")
            .required(false).addValidator(StandardValidators.NON_EMPTY_VALIDATOR).expressionLanguageSupported(true)
            .build();

    public static final PropertyDescriptor USE_WAN = new PropertyDescriptor.Builder().name("Use WAN").description(
            "Whether the connector is used against an Elasticsearch instance in a cloud/restricted environment over the WAN, such as Amazon Web Services. "
                    + "In this mode, the connector disables discovery and only connects through the declared es.nodes during all operations, including reads and writes."
                    + " Note that in this mode, performance is highly affected.")
            .allowableValues("true", "false").required(true).addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
            .expressionLanguageSupported(false).defaultValue("true").build();

    public static final PropertyDescriptor AUTO_CREATE_INDEX = new PropertyDescriptor.Builder()
            .name("Auto create index")
            .description("Whether or not the Elasticsearch index can be created if id does not already exist.")
            .allowableValues("true", "false").required(true).addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
            .expressionLanguageSupported(false).defaultValue("true").build();

    public static final PropertyDescriptor FIELD_INDEX_STRING = new PropertyDescriptor.Builder()
            .name("Field Index String").description("Comma separated list of fields to be indexed.").required(true)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR).expressionLanguageSupported(true)
            .defaultValue("${metadata.table.fieldIndexString}").build();
    private final static String KYLO_DATA_INDEX_NAME = "kylo-data";
    private final static String KYLO_DATA_INDEX_TYPE = "hive-data";
    private final Set<Relationship> relationships;
    private final List<PropertyDescriptor> propDescriptors;

    /**
     * default constructor constructs the relationship and property collections
     */
    public CreateElasticsearchBackedHiveTable() {
        final Set<Relationship> r = new HashSet<>();
        r.add(IngestProperties.REL_SUCCESS);
        r.add(IngestProperties.REL_FAILURE);
        relationships = Collections.unmodifiableSet(r);

        final List<PropertyDescriptor> pds = new ArrayList<>();
        pds.add(NODES);
        pds.add(FEED_ROOT);
        pds.add(ID_FIELD);
        pds.add(FIELD_SPECIFICATION);
        pds.add(FEED_NAME);
        pds.add(FEED_CATEGORY);
        pds.add(THRIFT_SERVICE);
        pds.add(JAR_URL);
        pds.add(USE_WAN);
        pds.add(AUTO_CREATE_INDEX);
        pds.add(FIELD_INDEX_STRING);
        propDescriptors = Collections.unmodifiableList(pds);
    }

    @Override
    public Set<Relationship> getRelationships() {
        return relationships;
    }

    @Override
    protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        return propDescriptors;
    }

    @Override
    public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
        FlowFile flowFile = session.get();
        if (flowFile == null) {
            return;
        }

        String jarUrl = context.getProperty(JAR_URL).evaluateAttributeExpressions(flowFile).getValue();
        String useWan = context.getProperty(USE_WAN).getValue();
        String autoIndex = context.getProperty(AUTO_CREATE_INDEX).getValue();
        String idField = context.getProperty(ID_FIELD).evaluateAttributeExpressions(flowFile).getValue();

        final ColumnSpec[] columnSpecs = Optional
                .ofNullable(
                        context.getProperty(FIELD_SPECIFICATION).evaluateAttributeExpressions(flowFile).getValue())
                .filter(StringUtils::isNotEmpty).map(ColumnSpec::createFromString).orElse(new ColumnSpec[0]);
        validateArrayProperty(FIELD_SPECIFICATION.getDisplayName(), columnSpecs, session, flowFile);

        final String feedName = context.getProperty(IngestProperties.FEED_NAME)
                .evaluateAttributeExpressions(flowFile).getValue();
        validateStringProperty(FEED_NAME.getDisplayName(), feedName, session, flowFile);

        final String categoryName = context.getProperty(IngestProperties.FEED_CATEGORY)
                .evaluateAttributeExpressions(flowFile).getValue();
        validateStringProperty(FEED_CATEGORY.getDisplayName(), categoryName, session, flowFile);

        final String nodes = context.getProperty(NODES).evaluateAttributeExpressions(flowFile).getValue();
        validateStringProperty(NODES.getDisplayName(), nodes, session, flowFile);

        final String indexString = context.getProperty(FIELD_INDEX_STRING).evaluateAttributeExpressions(flowFile)
                .getValue();
        validateStringProperty(FIELD_INDEX_STRING.getDisplayName(), indexString, session, flowFile);

        final String feedRoot = context.getProperty(FEED_ROOT).evaluateAttributeExpressions(flowFile).getValue();
        validateStringProperty(FEED_ROOT.getDisplayName(), feedRoot, session, flowFile);

        List<String> hiveStatements = getHQLStatements(columnSpecs, nodes, feedRoot, feedName, categoryName, useWan,
                autoIndex, idField, jarUrl, indexString);
        final ThriftService thriftService = context.getProperty(THRIFT_SERVICE)
                .asControllerService(ThriftService.class);

        executeStatements(session, flowFile, hiveStatements.toArray(new String[hiveStatements.size()]),
                thriftService);

    }

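    /**
     * Logs an error and routes the FlowFile to the failure relationship when the given array
     * property is missing or empty. Note that it does not interrupt the calling onTrigger.
     */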
    private void validateArrayProperty(String propertyName, Object[] values, ProcessSession session,
            FlowFile flowFile) {
        if (values == null || values.length == 0) {
            getLog().error("Missing " + propertyName + " specification");
            session.transfer(flowFile, IngestProperties.REL_FAILURE);
            return;
        }
    }

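    /**
     * Logs an error and routes the FlowFile to the failure relationship when the given string
     * property is missing or empty; as above, the caller is not interrupted.
     */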
    private void validateStringProperty(String propertyName, String propertyValue, ProcessSession session,
            FlowFile flowFile) {
        if (propertyValue == null || propertyValue.isEmpty()) {
            getLog().error("Missing " + propertyName + " parameter");
            session.transfer(flowFile, IngestProperties.REL_FAILURE);
            return;
        }
    }

    public String generateHQL(String columnsSQL, String nodes, String locationRoot, String feedName,
            String categoryName, String useWan, String autoIndex, String idField) {
        // elastic search records for the kylo-data index require the kylo_schema and kylo_table
        columnsSQL = columnsSQL + ", kylo_schema string, kylo_table string";

        // Construct the table location; Paths.get() collapses the "//" in scheme prefixes such as
        // "s3a://" or "hdfs://", so the "://" separator is restored afterwards
        Path path = Paths.get(locationRoot, categoryName, feedName, "index");
        String location = path.toString().replace(":/", "://");

        StringBuilder sb = new StringBuilder();
        sb.append("CREATE EXTERNAL TABLE IF NOT EXISTS ").append(categoryName).append(".").append(feedName)
                .append("_index").append(" (").append(columnsSQL).append(") ")
                .append("STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler' ").append("LOCATION '")
                .append(location).append("' ").append("TBLPROPERTIES('es.resource' = '")
                .append(KYLO_DATA_INDEX_NAME).append("/").append(KYLO_DATA_INDEX_TYPE).append("', 'es.nodes' = '")
                .append(nodes).append("', 'es.nodes.wan.only' = '").append(useWan)
                .append("', 'es.index.auto.create' = '").append(autoIndex);

        if (idField != null && !idField.isEmpty()) {
            sb.append("', 'es.mapping.id' = '").append(idField);
        }

        sb.append("')");

        return sb.toString();
    }
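
    /*
     * Illustration only (not part of the original source): with hypothetical inputs
     * columnsSQL = "id string, name string", nodes = "es-node-1:9200",
     * locationRoot = "s3a://hive-bucket/profile", feedName = "users", categoryName = "demo",
     * useWan = "true", autoIndex = "true" and an empty idField, generateHQL() returns a
     * single statement along the lines of:
     *
     *   CREATE EXTERNAL TABLE IF NOT EXISTS demo.users_index
     *     (id string, name string, kylo_schema string, kylo_table string)
     *   STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
     *   LOCATION 's3a://hive-bucket/profile/demo/users/index'
     *   TBLPROPERTIES('es.resource' = 'kylo-data/hive-data', 'es.nodes' = 'es-node-1:9200',
     *                 'es.nodes.wan.only' = 'true', 'es.index.auto.create' = 'true')
     */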

    public List<String> getHQLStatements(ColumnSpec[] columnSpecs, String nodes, String locationRoot,
            String feedName, String categoryName, String useWan, String autoIndex, String idField, String jarUrl,
            String indexFieldString) {

        final ColumnSpec[] partitions = {};

        TableType tableType = TableType.MASTER;

        List<String> indexFields = Arrays.asList(indexFieldString.toLowerCase().split(","));

        List<ColumnSpec> indexCols = Arrays.stream(columnSpecs)
                .filter(p -> indexFields.contains(p.getName().toLowerCase())).collect(Collectors.toList());

        String columnsSQL = tableType.deriveColumnSpecification(indexCols.toArray(new ColumnSpec[indexCols.size()]),
                partitions, "");
        String hql = generateHQL(columnsSQL, nodes, locationRoot, feedName, categoryName, useWan, autoIndex,
                idField);

        List<String> hiveStatements = new ArrayList<>();

        if (jarUrl != null && !jarUrl.isEmpty()) {
            String addJar = "ADD JAR " + jarUrl;
            hiveStatements.add(addJar);
        }

        hiveStatements.add(hql);
        return hiveStatements;
    }

}
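
For a quick sense of the DDL this processor submits to Hive, here is a minimal sketch (not part of the original source; the class name and all input values are hypothetical) that calls the public generateHQL method directly and prints the resulting statement. It assumes the Kylo/NiFi classes above are available on the classpath.

package com.thinkbiganalytics.nifi.v2.ingest;

/**
 * Illustrative only: prints the CREATE EXTERNAL TABLE statement produced by
 * CreateElasticsearchBackedHiveTable#generateHQL for a set of sample inputs.
 */
public class GenerateHqlPreview {

    public static void main(String[] args) {
        CreateElasticsearchBackedHiveTable processor = new CreateElasticsearchBackedHiveTable();

        String hql = processor.generateHQL(
                "id string, name string",          // columnsSQL: columns exposed in the index table
                "es-node-1:9200,es-node-2:9200",   // nodes: comma separated Elasticsearch nodes
                "s3a://hive-bucket/profile",       // locationRoot: root path for the index table
                "users",                           // feedName
                "demo",                            // categoryName
                "true",                            // useWan -> es.nodes.wan.only
                "true",                            // autoIndex -> es.index.auto.create
                "id");                             // idField -> es.mapping.id (may be empty)

        System.out.println(hql);
    }
}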