Source listing: org.apache.nifi.processors.standard.AttributesToCSV.java

This is the complete Java source code for the AttributesToCSV processor from the
Apache NiFi standard processors bundle. The listing follows below.

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nifi.processors.standard;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.StringEscapeUtils;
import org.apache.nifi.annotation.behavior.EventDriven;
import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.behavior.SideEffectFree;
import org.apache.nifi.annotation.behavior.SupportsBatching;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.annotation.lifecycle.OnScheduled;
import org.apache.nifi.components.AllowableValue;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.expression.ExpressionLanguageScope;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.processor.AbstractProcessor;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

@EventDriven
@SideEffectFree
@SupportsBatching
@Tags({ "csv", "attributes", "flowfile" })
@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED)
@CapabilityDescription("Generates a CSV representation of the input FlowFile Attributes. The resulting CSV "
        + "can be written to either a newly generated attribute named 'CSVAttributes' or written to the FlowFile as content.  "
        + "If the attribute value contains a comma, newline or double quote, then the attribute value will be "
        + "escaped with double quotes.  Any double quote characters in the attribute value are escaped with "
        + "another double quote.")
@WritesAttributes({ @WritesAttribute(attribute = "CSVSchema", description = "CSV representation of the Schema"),
        @WritesAttribute(attribute = "CSVData", description = "CSV representation of Attributes") })

/**
 * Converts the attributes of an incoming FlowFile into a single CSV line
 * (and optionally a CSV header line), written either to the {@code CSVData}/
 * {@code CSVSchema} attributes or to the FlowFile content, depending on the
 * Destination property.
 *
 * <p>Thread-safety: all mutable state is populated once in
 * {@link #onScheduled(ProcessContext)} and treated as read-only during
 * {@link #onTrigger(ProcessContext, ProcessSession)}; the attribute-matching
 * regex is compiled per FlowFile as a local variable because it supports
 * Expression Language against FlowFile attributes.</p>
 */
public class AttributesToCSV extends AbstractProcessor {
    private static final String DATA_ATTRIBUTE_NAME = "CSVData";
    private static final String SCHEMA_ATTRIBUTE_NAME = "CSVSchema";
    private static final String OUTPUT_SEPARATOR = ",";
    private static final String OUTPUT_MIME_TYPE = "text/csv";
    // Splits the user-supplied attribute list on commas that are NOT inside a
    // double-quoted field, so attribute names containing commas can be quoted.
    private static final String SPLIT_REGEX = OUTPUT_SEPARATOR + "(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)";

    static final AllowableValue OUTPUT_OVERWRITE_CONTENT = new AllowableValue("flowfile-content",
            "flowfile-content", "The resulting CSV string will be placed into the content of the flowfile."
                    + "Existing flowfile context will be overwritten. 'CSVData' will not be written to at all (neither null nor empty string).");
    static final AllowableValue OUTPUT_NEW_ATTRIBUTE = new AllowableValue("flowfile-attribute",
            "flowfile-attribute", "The resulting CSV string will be placed into a new flowfile"
                    + " attribute named 'CSVData'.  The content of the flowfile will not be changed.");

    public static final PropertyDescriptor ATTRIBUTES_LIST = new PropertyDescriptor.Builder().name("attribute-list")
            .displayName("Attribute List")
            .description("Comma separated list of attributes to be included in the resulting CSV. If this value "
                    + "is left empty then all existing Attributes will be included. This list of attributes is "
                    + "case sensitive and supports attribute names that contain commas. If an attribute specified in the list is not found it will be emitted "
                    + "to the resulting CSV with an empty string or null depending on the 'Null Value' property. "
                    + "If a core attribute is specified in this list "
                    + "and the 'Include Core Attributes' property is false, the core attribute will be included. The attribute list "
                    + "ALWAYS wins.")
            .required(false).addValidator(StandardValidators.NON_EMPTY_EL_VALIDATOR)
            .expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES).build();

    public static final PropertyDescriptor ATTRIBUTES_REGEX = new PropertyDescriptor.Builder()
            .name("attributes-regex").displayName("Attributes Regular Expression")
            .description("Regular expression that will be evaluated against the flow file attributes to select "
                    + "the matching attributes. This property can be used in combination with the attributes "
                    + "list property.  The final output will contain a combination of matches found in the ATTRIBUTE_LIST and ATTRIBUTE_REGEX.")
            .required(false).expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
            .addValidator(StandardValidators.createRegexValidator(0, Integer.MAX_VALUE, true))
            .addValidator(StandardValidators.NON_EMPTY_EL_VALIDATOR).build();

    public static final PropertyDescriptor DESTINATION = new PropertyDescriptor.Builder().name("destination")
            .displayName("Destination")
            .description("Control if CSV value is written as a new flowfile attribute 'CSVData' "
                    + "or written in the flowfile content.")
            .required(true).allowableValues(OUTPUT_NEW_ATTRIBUTE, OUTPUT_OVERWRITE_CONTENT)
            // Use the canonical value (not the display name) as the default; the
            // two happen to be equal here, but the value is what gets compared.
            .defaultValue(OUTPUT_NEW_ATTRIBUTE.getValue()).build();

    public static final PropertyDescriptor INCLUDE_CORE_ATTRIBUTES = new PropertyDescriptor.Builder()
            .name("include-core-attributes").displayName("Include Core Attributes")
            .description("Determines if the FlowFile org.apache.nifi.flowfile.attributes.CoreAttributes, which are "
                    + "contained in every FlowFile, should be included in the final CSV value generated.  Core attributes "
                    + "will be added to the end of the CSVData and CSVSchema strings.  The Attribute List property "
                    + "overrides this setting.")
            .required(true).allowableValues("true", "false").addValidator(StandardValidators.BOOLEAN_VALIDATOR)
            .defaultValue("true").build();

    public static final PropertyDescriptor NULL_VALUE_FOR_EMPTY_STRING = new PropertyDescriptor.Builder()
            .name("null-value").displayName("Null Value")
            .description(
                    "If true a non existing or empty attribute will be 'null' in the resulting CSV. If false an empty "
                            + "string will be placed in the CSV")
            .required(true).allowableValues("true", "false").addValidator(StandardValidators.BOOLEAN_VALIDATOR)
            .defaultValue("false").build();

    public static final PropertyDescriptor INCLUDE_SCHEMA = new PropertyDescriptor.Builder().name("include-schema")
            .displayName("Include Schema")
            .description(
                    "If true the schema (attribute names) will also be converted to a CSV string which will either be "
                            + "applied to a new attribute named 'CSVSchema' or applied at the first row in the "
                            + "content depending on the DESTINATION property setting.")
            .required(true).allowableValues("true", "false").addValidator(StandardValidators.BOOLEAN_VALIDATOR)
            .defaultValue("false").build();

    public static final Relationship REL_SUCCESS = new Relationship.Builder().name("success")
            .description("Successfully converted attributes to CSV").build();
    public static final Relationship REL_FAILURE = new Relationship.Builder().name("failure")
            .description("Failed to convert attributes to CSV").build();

    private List<PropertyDescriptor> properties;
    private Set<Relationship> relationships;
    // Snapshot of property values taken in onScheduled; volatile so a
    // reschedule on one thread is visible to onTrigger on another.
    private volatile Boolean includeCoreAttributes;
    private volatile Set<String> coreAttributes;
    private volatile boolean destinationContent;
    private volatile boolean nullValForEmptyString;
    private volatile Boolean includeSchema;
    // NOTE: the attribute-matching Pattern is intentionally NOT a field. It is
    // evaluated per FlowFile (the regex property supports Expression Language
    // against FlowFile attributes), so storing it in shared state would race
    // between concurrent onTrigger invocations and would leak a stale pattern
    // after the property is cleared.

    @Override
    protected void init(final ProcessorInitializationContext context) {
        final List<PropertyDescriptor> properties = new ArrayList<>();
        properties.add(ATTRIBUTES_LIST);
        properties.add(ATTRIBUTES_REGEX);
        properties.add(DESTINATION);
        properties.add(INCLUDE_CORE_ATTRIBUTES);
        properties.add(NULL_VALUE_FOR_EMPTY_STRING);
        properties.add(INCLUDE_SCHEMA);
        this.properties = Collections.unmodifiableList(properties);

        final Set<Relationship> relationships = new HashSet<>();
        relationships.add(REL_SUCCESS);
        relationships.add(REL_FAILURE);
        this.relationships = Collections.unmodifiableSet(relationships);
    }

    @Override
    protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        return properties;
    }

    @Override
    public Set<Relationship> getRelationships() {
        return relationships;
    }

    /**
     * Builds the ordered map of attribute name to value that will become the CSV line.
     *
     * <p>Selection rules, in order: the explicit attribute list (missing/empty entries
     * become "null" or "" depending on the Null Value property), then any attributes
     * matching the regex, or ALL attributes when neither selector is set. Core
     * attributes are then appended (or removed) according to the Include Core
     * Attributes property, with the explicit attribute list always winning.</p>
     *
     * @param ff         the FlowFile whose attributes are read
     * @param attributes user-specified attribute names (may be empty, never null)
     * @param attPattern compiled regex selector, or {@code null} if none configured
     * @return insertion-ordered map of the selected attribute names and values
     */
    private Map<String, String> buildAttributesMapForFlowFile(FlowFile ff, Set<String> attributes,
            Pattern attPattern) {
        final Map<String, String> ffAttributes = ff.getAttributes();
        // LinkedHashMap preserves the user-specified / insertion order so the
        // schema line and the data line stay aligned.
        final Map<String, String> result = new LinkedHashMap<>(ffAttributes.size());

        if (!attributes.isEmpty() || attPattern != null) {
            if (!attributes.isEmpty()) {
                // The user gave an explicit list of attributes; absent or empty
                // values are still emitted so every row has the same columns.
                for (String attribute : attributes) {
                    final String val = ff.getAttribute(attribute);
                    if (val != null && !val.isEmpty()) {
                        result.put(attribute, val);
                    } else {
                        result.put(attribute, nullValForEmptyString ? "null" : "");
                    }
                }
            }
            if (attPattern != null) {
                for (Map.Entry<String, String> e : ffAttributes.entrySet()) {
                    if (attPattern.matcher(e.getKey()).matches()) {
                        result.put(e.getKey(), e.getValue());
                    }
                }
            }
        } else {
            // No selector configured: take every attribute from the FlowFile.
            result.putAll(ffAttributes);
        }

        if (includeCoreAttributes) {
            // Glue the core attributes onto the end, if present on this FlowFile.
            for (String coreAttribute : coreAttributes) {
                if (ffAttributes.containsKey(coreAttribute)) {
                    final String val = ff.getAttribute(coreAttribute);
                    if (!StringUtils.isEmpty(val)) {
                        result.put(coreAttribute, val);
                    } else {
                        result.put(coreAttribute, nullValForEmptyString ? "null" : "");
                    }
                }
            }
        } else {
            // Strip core attributes the user does not want -- unless they were
            // named in the attribute list, which always wins.
            for (String coreAttribute : coreAttributes) {
                if (!attributes.contains(coreAttribute)) {
                    result.remove(coreAttribute);
                }
            }
        }
        return result;
    }

    /**
     * Parses the user-specified attribute list into an ordered, de-duplicated set.
     * Entries are split on unquoted commas and CSV-unescaped, so quoted names may
     * themselves contain commas.
     *
     * @param attributeList raw property value; may be null or blank
     * @return insertion-ordered set of attribute names, empty when none given
     */
    private LinkedHashSet<String> attributeListStringToSet(String attributeList) {
        final LinkedHashSet<String> result = new LinkedHashSet<>();
        if (StringUtils.isNotBlank(attributeList)) {
            for (String str : attributeList.split(SPLIT_REGEX)) {
                result.add(StringEscapeUtils.unescapeCsv(str.trim()));
            }
        }
        return result;
    }

    /**
     * Joins the given fields into one CSV line, escaping each field per RFC 4180
     * (values containing comma, newline or double quote are quoted; embedded
     * quotes are doubled).
     */
    private String toCsvLine(final Collection<String> fields) {
        return fields.stream()
                .map(StringEscapeUtils::escapeCsv)
                .collect(Collectors.joining(OUTPUT_SEPARATOR));
    }

    /**
     * Caches the scheduling-time property values used by every onTrigger call.
     */
    @OnScheduled
    public void onScheduled(ProcessContext context) {
        includeCoreAttributes = context.getProperty(INCLUDE_CORE_ATTRIBUTES).asBoolean();
        // LinkedHashSet keeps the CoreAttributes declaration order so the core
        // attributes are appended to the CSV in a stable, documented order
        // (a plain HashSet would make the column order vary between runs).
        coreAttributes = Arrays.stream(CoreAttributes.values()).map(CoreAttributes::key)
                .collect(Collectors.toCollection(LinkedHashSet::new));
        destinationContent = OUTPUT_OVERWRITE_CONTENT.getValue()
                .equals(context.getProperty(DESTINATION).getValue());
        nullValForEmptyString = context.getProperty(NULL_VALUE_FOR_EMPTY_STRING).asBoolean();
        includeSchema = context.getProperty(INCLUDE_SCHEMA).asBoolean();
    }

    @Override
    public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
        final FlowFile original = session.get();
        if (original == null) {
            return;
        }

        // Compile the regex locally: the property supports Expression Language
        // against this FlowFile's attributes, and a local avoids both the data
        // race on a shared field and a stale pattern lingering after the
        // property is unset.
        final Pattern pattern = context.getProperty(ATTRIBUTES_REGEX).isSet()
                ? Pattern.compile(
                        context.getProperty(ATTRIBUTES_REGEX).evaluateAttributeExpressions(original).getValue())
                : null;

        final Set<String> attributeList = attributeListStringToSet(
                context.getProperty(ATTRIBUTES_LIST).evaluateAttributeExpressions(original).getValue());
        final Map<String, String> atrList = buildAttributesMapForFlowFile(original, attributeList, pattern);

        final String csvValues = toCsvLine(atrList.values());
        final String csvNames = includeSchema ? toCsvLine(atrList.keySet()) : "";

        try {
            if (destinationContent) {
                FlowFile conFlowfile = session.write(original, (in, out) -> {
                    // Explicit UTF-8: getBytes() without a charset would depend
                    // on the JVM's platform default encoding.
                    if (includeSchema) {
                        out.write((csvNames + System.lineSeparator()).getBytes(StandardCharsets.UTF_8));
                    }
                    out.write(csvValues.getBytes(StandardCharsets.UTF_8));
                });
                conFlowfile = session.putAttribute(conFlowfile, CoreAttributes.MIME_TYPE.key(), OUTPUT_MIME_TYPE);
                session.transfer(conFlowfile, REL_SUCCESS);
            } else {
                FlowFile atFlowfile = session.putAttribute(original, DATA_ATTRIBUTE_NAME, csvValues);
                if (includeSchema) {
                    // putAttribute returns a NEW FlowFile reference; it must be
                    // captured and transferred, otherwise CSVSchema is silently
                    // dropped (the previous code discarded the return value).
                    atFlowfile = session.putAttribute(atFlowfile, SCHEMA_ATTRIBUTE_NAME, csvNames);
                }
                session.transfer(atFlowfile, REL_SUCCESS);
            }
        } catch (Exception e) {
            // Log the full exception (not just the message) so the stack trace
            // is preserved in the NiFi logs.
            getLogger().error("Failed to convert attributes to CSV for {}", new Object[] { original }, e);
            session.transfer(original, REL_FAILURE);
        }
    }
}