org.apache.nifi.schemaregistry.processors.CSVUtils.java Source code

Introduction

Below is the source code for org.apache.nifi.schemaregistry.processors.CSVUtils.java, a package-private utility class from Apache NiFi that converts FlowFile content between CSV and Avro formats.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nifi.schemaregistry.processors;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.StringWriter;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.nifi.components.ValidationContext;
import org.apache.nifi.components.ValidationResult;
import org.apache.nifi.components.Validator;
import org.apache.nifi.flowfile.FlowFile;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.node.BooleanNode;
import org.codehaus.jackson.node.DoubleNode;
import org.codehaus.jackson.node.IntNode;
import org.codehaus.jackson.node.LongNode;
import org.codehaus.jackson.node.TextNode;

/**
 * Various CSV related utility operations relevant to transforming contents of
 * the {@link FlowFile} between CSV and AVRO formats.
 */
class CSVUtils {
    /**
     * Provides a {@link Validator} to ensure that the provided value is a valid
     * single character.
     */
    public static final Validator CHAR_VALIDATOR = new Validator() {
        @Override
        public ValidationResult validate(String subject, String input, ValidationContext context) {
            // Allows special, escaped characters as input, which is then un-escaped and converted to a single character.
            // Examples for special characters: \t (or \u0009), \f.
            if (input.length() > 1) {
                input = StringEscapeUtils.unescapeJava(input);
            }
            return new ValidationResult.Builder().subject(subject).input(input)
                    .explanation("Only non-null single characters are supported")
                    .valid(input.length() == 1 && input.charAt(0) != 0).build();
        }
    };
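
    // For illustration: CHAR_VALIDATOR accepts "," and the escaped form "\t"
    // (un-escaped to a single tab character), but rejects multi-character
    // input such as "ab".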

    public static GenericRecord read(InputStream record, char delimiter, Schema schema, char quoteChar) {
        Record avroRecord = new GenericData.Record(schema);
        String recordContent = convertInputStreamToString(record);
        String[] parsedRecord = parseFields(recordContent, delimiter, quoteChar);
        List<Field> fields = schema.getFields();
        if (parsedRecord.length != fields.size()) {
            throw new IllegalStateException(
                    "Incompatible schema. Parsed fields count does not match the count of fields from schema. "
                            + "Schema: " + schema.toString(true) + "\n Record: " + recordContent);
        }

        for (int i = 0; i < fields.size(); i++) {
            Field field = fields.get(i);
            Type type = field.schema().getType();
            updateRecord(field, type, parsedRecord[i], avroRecord);
        }
        return avroRecord;
    }
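
    // For illustration: with a schema of [name: string, age: int], delimiter ','
    // and quote '"', an input stream containing "John,39" produces a record
    // with name = "John" and age = 39.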

    /**
     * Parses the provided record into fields using the provided delimiter. The
     * 'quoteChar' is used to ensure that a delimiter appearing inside quotes
     * is not parsed into a separate field.
     */
    public static String[] parseFields(String record, char delimiter, char quoteChar) {
        List<String> result = new ArrayList<String>();
        int start = 0;
        boolean inQuotes = false;
        for (int i = 0; i < record.length(); i++) {
            if (record.charAt(i) == quoteChar) {
                inQuotes = !inQuotes;
            }
            boolean atLastChar = (i == record.length() - 1);
            if (atLastChar) {
                if (record.charAt(i) == delimiter) {
                    // The record ends with the delimiter, so the trailing column
                    // value is missing; add NULL for it.
                    result.add(record.substring(start, i));
                    result.add(null);
                } else {
                    result.add(record.substring(start));
                }
            } else if (record.charAt(i) == delimiter && !inQuotes) {
                if (start == i) {
                    //There is no value, so add NULL to indicate the absence of a value for this field.
                    result.add(null);
                } else {
                    result.add(record.substring(start, i));
                }
                start = i + 1;
            }
        }
        return result.toArray(new String[] {});
    }
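
    // Worked example: parseFields("a,\"b,c\",d", ',', '"') returns
    // ["a", "\"b,c\"", "d"]; the delimiter inside the quotes does not split
    // the field, and the surrounding quote characters are retained.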

    /**
     * Writes the {@link GenericRecord} as a CSV (delimited) record to the
     * {@link OutputStream} using the provided delimiter.
     */
    public static void write(GenericRecord record, char delimiter, OutputStream out) {
        List<Field> fields = record.getSchema().getFields();

        String delimiterToUse = "";
        try {
            for (Field field : fields) {
                out.write(delimiterToUse.getBytes(StandardCharsets.UTF_8));
                Object fieldValue = record.get(field.name());
                if (null == fieldValue) {
                    out.write(new byte[0]);
                } else {
                    if (Type.BYTES == field.schema().getType()) {
                        // BYTES fields arrive serialized as a ByteBuffer; only the
                        // 'decimal' logical type is currently supported.
                        String logicalType = field.getProp("logicalType");
                        if (!"decimal".equals(logicalType)) {
                            throw new IllegalArgumentException(
                                    "The field '" + field.name() + "' has a logical type of '"
                                            + logicalType + "' that is currently not supported.");
                        }

                        JsonNode rawPrecision = field.getJsonProp("precision");
                        if (null == rawPrecision) {
                            throw new IllegalArgumentException(
                                    "The field '" + field.name() + "' is missing the required precision property");
                        }
                        int precision = rawPrecision.asInt();
                        JsonNode rawScale = field.getJsonProp("scale");
                        int scale = null == rawScale ? 0 : rawScale.asInt();

                        // write out the decimal with the precision and scale.
                        NumberFormat numberFormat = DecimalFormat.getInstance();
                        numberFormat.setGroupingUsed(false);
                        normalizeNumberFormat(numberFormat, scale, precision);
                        final String rawValue = new String(((ByteBuffer) fieldValue).array());
                        out.write(numberFormat.format(new BigDecimal(rawValue)).getBytes(StandardCharsets.UTF_8));
                    } else {
                        out.write(fieldValue.toString().getBytes(StandardCharsets.UTF_8));
                    }
                }
                if (delimiterToUse.length() == 0) {
                    delimiterToUse = String.valueOf(delimiter);
                }
            }
        } catch (IOException e) {
            throw new IllegalStateException("Failed to write AVRO Record", e);
        }
    }

    /**
     * According to the Avro 1.7.7 specification, if a logical type is invalid (for
     * example, a decimal with scale greater than its precision), implementations
     * should ignore the logical type and use the underlying Avro type.
     */
    private static void normalizeNumberFormat(NumberFormat numberFormat, int scale, int precision) {
        if (scale < precision) {
            // write out with the specified precision and scale.
            numberFormat.setMaximumIntegerDigits(precision);
            numberFormat.setMaximumFractionDigits(scale);
            numberFormat.setMinimumFractionDigits(scale);
        }
    }
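
    // For illustration: with precision 5 and scale 2, new BigDecimal("123.456")
    // formats as "123.46", while new BigDecimal("123.4") formats as "123.40"
    // (the minimum fraction digits pad out the scale).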

    /**
     * Reads the provided {@link InputStream} fully into a UTF-8 {@link String}.
     */
    private static String convertInputStreamToString(InputStream record) {
        StringWriter writer = new StringWriter();
        try {
            IOUtils.copy(record, writer, StandardCharsets.UTF_8);
        } catch (Exception e) {
            throw new IllegalStateException("Failed to read InputStream into String", e);
        }
        return writer.toString();
    }

    /**
     * Encodes the provided field value according to the field's logical type
     * (currently only 'decimal' is supported), returning the formatted value
     * as a UTF-8 encoded {@link ByteBuffer}.
     */
    private static ByteBuffer encodeLogicalType(final Field field, final String fieldValue) {
        String logicalType = field.getProp("logicalType");
        if (!"decimal".equals(logicalType)) {
            throw new IllegalArgumentException("The field '" + field.name() + "' has a logical type of '"
                    + logicalType + "' that is currently not supported.");
        }

        JsonNode rawPrecision = field.getJsonProp("precision");
        if (null == rawPrecision) {
            throw new IllegalArgumentException(
                    "The field '" + field.name() + "' is missing the required precision property");
        }
        int precision = rawPrecision.asInt();
        JsonNode rawScale = field.getJsonProp("scale");
        int scale = null == rawScale ? 0 : rawScale.asInt();

        NumberFormat numberFormat = DecimalFormat.getInstance();
        numberFormat.setGroupingUsed(false);
        normalizeNumberFormat(numberFormat, scale, precision);
        BigDecimal decimal = null == fieldValue ? new BigDecimal(retrieveDefaultFieldValue(field).asText())
                : new BigDecimal(fieldValue);
        return ByteBuffer.wrap(numberFormat.format(decimal).getBytes(StandardCharsets.UTF_8));
    }
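
    // For illustration: given a BYTES field declared with logicalType "decimal",
    // precision 5 and scale 2, encodeLogicalType(field, "123.4") returns a
    // ByteBuffer over the UTF-8 bytes of "123.40".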

    /**
     * Retrieves the default value for the field from the Avro {@link Schema},
     * throwing an {@link IllegalArgumentException} if none is supplied.
     */
    private static JsonNode retrieveDefaultFieldValue(Field field) {
        JsonNode jsonNode = field.defaultValue();
        if (null == jsonNode) {
            throw new IllegalArgumentException("The field '" + field.name()
                    + "' is NULL and there is no default value supplied in the Avro Schema");
        }
        return jsonNode;
    }

    /**
     * Sets the parsed (or default) value for the given field on the Avro
     * {@link Record}, converting it according to the field's Avro {@link Type}.
     */
    private static void updateRecord(Field field, Type type, String providedValue, Record avroRecord) {
        if (Type.NULL != type) {
            Object value;
            if (Type.INT == type) {
                value = null == providedValue ? possiblyGetDefaultValue(field, IntNode.class).getIntValue()
                        : Integer.parseInt(providedValue);
                avroRecord.put(field.name(), value);
            } else if (Type.BOOLEAN == type) {
                value = null == providedValue ? possiblyGetDefaultValue(field, BooleanNode.class).getBooleanValue()
                        : Boolean.parseBoolean(providedValue);
                avroRecord.put(field.name(), value);
            } else if (Type.DOUBLE == type) {
                value = null == providedValue ? possiblyGetDefaultValue(field, DoubleNode.class).getDoubleValue()
                        : Double.parseDouble(providedValue);
                avroRecord.put(field.name(), value);
            } else if (Type.FLOAT == type) {
                value = null == providedValue ? possiblyGetDefaultValue(field, DoubleNode.class).getDoubleValue()
                        : Float.parseFloat(providedValue);
                avroRecord.put(field.name(), value);
            } else if (Type.LONG == type) {
                value = null == providedValue ? possiblyGetDefaultValue(field, LongNode.class).getLongValue()
                        : Long.parseLong(providedValue);
                avroRecord.put(field.name(), value);
            } else if (Type.STRING == type) {
                value = null == providedValue ? possiblyGetDefaultValue(field, TextNode.class).getTextValue()
                        : providedValue;
                avroRecord.put(field.name(), value);
            } else if (Type.BYTES == type) {
                value = encodeLogicalType(field, providedValue);
                avroRecord.put(field.name(), value);
            } else if (Type.UNION == type) {
                // For unions, attempt each member type in turn; the NULL member
                // is skipped and later members overwrite earlier ones.
                field.schema().getTypes()
                        .forEach(schema -> updateRecord(field, schema.getType(), providedValue, avroRecord));
            } else if (Type.ARRAY == type || Type.ENUM == type || Type.FIXED == type || Type.MAP == type
                    || Type.RECORD == type) {
                throw new IllegalArgumentException("The field type '" + type + "' is not supported at the moment");
            } else {
                avroRecord.put(field.name(), providedValue);
            }
        }
    }

    /**
     * Checks whether there is a default value of the expected type to use; if
     * not, throws an {@link IllegalArgumentException}.
     */
    private static <T extends JsonNode> JsonNode possiblyGetDefaultValue(Field field,
            Class<T> expectedDefaultType) {
        JsonNode jsonNode = retrieveDefaultFieldValue(field);
        if (field.schema().getType() != Type.UNION && !expectedDefaultType.isAssignableFrom(jsonNode.getClass())) {
            // Schema evolution is not supported here, so a default value of the
            // wrong type means the schema itself is in error.
            throw new IllegalArgumentException("The field '" + field.name() + "' has a default value that "
                    + "does not match the field type. Expected type is: '" + expectedDefaultType.getName()
                    + "' and the default value type is: '" + field.defaultValue().toString() + "'");
        }
        }
        return jsonNode;
    }
}
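
Example usage

The following is a minimal usage sketch, not part of the original file. It assumes an Avro schema whose fields line up one-to-one with the CSV columns; the CSVUtilsExample class name, the Person schema, and the sample record "John,39" are all illustrative. Since CSVUtils is package-private, such a caller would need to live in the same org.apache.nifi.schemaregistry.processors package.

package org.apache.nifi.schemaregistry.processors;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;

public class CSVUtilsExample {
    public static void main(String[] args) {
        // Illustrative schema: two fields matching a two-column CSV record.
        Schema schema = new Schema.Parser().parse(
                "{\"type\":\"record\",\"name\":\"Person\",\"fields\":["
                        + "{\"name\":\"name\",\"type\":\"string\"},"
                        + "{\"name\":\"age\",\"type\":\"int\"}]}");

        // Parse one CSV record into an Avro GenericRecord.
        ByteArrayInputStream in = new ByteArrayInputStream(
                "John,39".getBytes(StandardCharsets.UTF_8));
        GenericRecord record = CSVUtils.read(in, ',', schema, '"');

        // Write the record back out as a delimited line: prints "John,39".
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        CSVUtils.write(record, ',', out);
        System.out.println(out.toString());
    }
}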