org.apache.nifi.processors.kite.TestInferAvroSchema.java Source code

Java tutorial

Introduction

Here is the source code for TestInferAvroSchema.java (package org.apache.nifi.processors.kite), the JUnit test class for the NiFi InferAvroSchema processor.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.nifi.processors.kite;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

/**
 * Unit tests for {@link InferAvroSchema}, driven through NiFi's mock {@link TestRunner}.
 *
 * <p>Each test enqueues CSV or JSON content (from {@code src/test/resources} or inline), runs the processor
 * once and then checks how many flow files were routed to each relationship, together with the inferred
 * schema and/or the resulting {@code mime.type} attribute where relevant.
 */
public class TestInferAvroSchema {

    private TestRunner runner;

    /**
     * Creates a fresh runner for every test and applies the property set shared by all of them.
     */
    @Before
    public void setup() {
        runner = TestRunners.newTestRunner(InferAvroSchema.class);

        // A freshly created runner is expected to be invalid until the properties below are set.
        runner.assertNotValid();

        runner.setProperty(InferAvroSchema.INPUT_CONTENT_TYPE, InferAvroSchema.USE_MIME_TYPE);
        runner.setProperty(InferAvroSchema.GET_CSV_HEADER_DEFINITION_FROM_INPUT, "true");
        runner.setProperty(InferAvroSchema.SCHEMA_DESTINATION, InferAvroSchema.DESTINATION_CONTENT);
        runner.setProperty(InferAvroSchema.HEADER_LINE_SKIP_COUNT, "0");
        runner.setProperty(InferAvroSchema.ESCAPE_STRING, "\\");
        runner.setProperty(InferAvroSchema.QUOTE_STRING, "'");
        runner.setProperty(InferAvroSchema.RECORD_NAME, "org.apache.nifi.contact");
        runner.setProperty(InferAvroSchema.CHARSET, "UTF-8");
        runner.setProperty(InferAvroSchema.PRETTY_AVRO_OUTPUT, "true");
    }

    /**
     * Checks which values the {@code RECORD_NAME} property validator accepts and rejects.
     */
    @Test
    public void testRecordName() throws Exception {

        // Dot at the end is invalid
        runner.setProperty(InferAvroSchema.RECORD_NAME, "org.apache.nifi.contact.");
        runner.assertNotValid();
        // Dashes are invalid
        runner.setProperty(InferAvroSchema.RECORD_NAME, "avro-schema");
        runner.assertNotValid();
        // Name cannot start with a digit
        runner.setProperty(InferAvroSchema.RECORD_NAME, "1Record");
        runner.assertNotValid();
        // Name cannot start with a dot
        runner.setProperty(InferAvroSchema.RECORD_NAME, ".record");
        runner.assertNotValid();

        runner.setProperty(InferAvroSchema.RECORD_NAME, "avro_schema");
        runner.assertValid();
        runner.setProperty(InferAvroSchema.RECORD_NAME, "org.apache.nifi.contact");
        runner.assertValid();
        // Expression Language is valid, although its value may not be once evaluated
        runner.setProperty(InferAvroSchema.RECORD_NAME, "${filename}");
        runner.assertValid();
    }

    /**
     * Infers the schema of a CSV file whose header definition is taken from the input itself and compares
     * the flow file content with the expected schema file.
     */
    @Test
    public void inferAvroSchemaFromHeaderDefinitionOfCSVFile() throws Exception {

        runner.assertValid();

        final Map<String, String> attributes = new HashMap<>();
        attributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv");
        runner.enqueue(new File("src/test/resources/Shapes_Header.csv").toPath(), attributes);

        runner.run();
        runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0);
        runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 0);
        runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 1);
        runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 1);

        final MockFlowFile flowFile = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0);
        flowFile.assertContentEquals(
                unix2PlatformSpecificLineEndings(new File("src/test/resources/Shapes_header.csv.avro")));
        flowFile.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/avro-binary");
    }

    /**
     * Infers the schema of a JSON file and writes it to a flow file attribute instead of the content.
     */
    @Test
    public void inferAvroSchemaFromJSONFile() throws Exception {

        runner.assertValid();

        runner.setProperty(InferAvroSchema.INPUT_CONTENT_TYPE, InferAvroSchema.USE_MIME_TYPE);

        // Purposely set to true to test that none of the JSON file is read as a CSV header, which would cause issues.
        runner.setProperty(InferAvroSchema.GET_CSV_HEADER_DEFINITION_FROM_INPUT, "true");
        runner.setProperty(InferAvroSchema.SCHEMA_DESTINATION, InferAvroSchema.DESTINATION_ATTRIBUTE);

        final Map<String, String> attributes = new HashMap<>();
        attributes.put(CoreAttributes.MIME_TYPE.key(), "application/json");
        runner.enqueue(new File("src/test/resources/Shapes.json").toPath(), attributes);

        runner.run();
        runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0);
        runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 0);
        runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 1);
        runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 1);

        final MockFlowFile data = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0);
        final String avroSchema = data.getAttribute(InferAvroSchema.AVRO_SCHEMA_ATTRIBUTE_NAME);
        final String knownSchema = new String(
                unix2PlatformSpecificLineEndings(new File("src/test/resources/Shapes.json.avro")),
                StandardCharsets.UTF_8);
        // JUnit expects (expected, actual); this order keeps failure messages correctly labelled.
        Assert.assertEquals(knownSchema, avroSchema);

        // Since the Avro schema is written to an attribute, the MIME type should be the same as the original
        data.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/json");
    }

    /**
     * Infers the schema of a header-less CSV file, with the header definition supplied through the
     * {@code CSV_HEADER_DEFINITION} property.
     */
    @Test
    public void inferAvroSchemaFromCSVFile() throws Exception {

        runner.assertValid();

        // Read in the header; try-with-resources makes sure the file stream is closed after copying.
        final StringWriter headerDefinition = new StringWriter();
        try (final BufferedInputStream headerStream = new BufferedInputStream(
                Files.newInputStream(Paths.get("src/test/resources/ShapesHeader.csv"), StandardOpenOption.READ))) {
            IOUtils.copy(headerStream, headerDefinition, "UTF-8");
        }
        runner.setProperty(InferAvroSchema.CSV_HEADER_DEFINITION, headerDefinition.toString());
        runner.setProperty(InferAvroSchema.GET_CSV_HEADER_DEFINITION_FROM_INPUT, "false");

        final Map<String, String> attributes = new HashMap<>();
        attributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv");
        runner.enqueue(new File("src/test/resources/Shapes_NoHeader.csv").toPath(), attributes);

        runner.run();
        runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0);
        runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 0);
        runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 1);
        runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 1);

        final MockFlowFile data = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0);
        data.assertContentEquals(
                unix2PlatformSpecificLineEndings(new File("src/test/resources/Shapes_header.csv.avro")));
        data.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/avro-binary");
    }

    /**
     * Supplies the header definition through the processor property while the enqueued content still
     * starts with that header line, so {@code HEADER_LINE_SKIP_COUNT} is set to 1.
     */
    @Test
    public void inferSchemaFormHeaderLinePropertyOfProcessor() throws Exception {

        // Decode explicitly as UTF-8 so the test does not depend on the platform default charset.
        final String csvHeaderLine = FileUtils.readFileToString(
                new File("src/test/resources/ShapesHeader.csv"), StandardCharsets.UTF_8);

        runner.assertValid();

        runner.setProperty(InferAvroSchema.GET_CSV_HEADER_DEFINITION_FROM_INPUT, "false");
        runner.setProperty(InferAvroSchema.CSV_HEADER_DEFINITION, csvHeaderLine);
        runner.setProperty(InferAvroSchema.HEADER_LINE_SKIP_COUNT, "1");

        runner.assertValid();

        final Map<String, String> attributes = new HashMap<>();
        attributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv");
        // Encode with the same charset the processor is configured with (CHARSET = UTF-8).
        runner.enqueue((csvHeaderLine + "\nJane,Doe,29,55555").getBytes(StandardCharsets.UTF_8), attributes);

        runner.run();
        runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 0);
        runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 1);
        runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 1);

        final MockFlowFile data = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0);
        data.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/avro-binary");
    }

    /**
     * Empty CSV content is expected to be routed to failure only.
     */
    @Test
    public void inferSchemaFromEmptyContent() throws Exception {
        runner.assertValid();

        final Map<String, String> attributes = new HashMap<>();
        attributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv");
        runner.enqueue("", attributes);

        runner.run();
        runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 1);
        runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 0);
        runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 0);
    }

    /**
     * Infers the schema of a tab-delimited file when {@code DELIMITER} is set to the escaped tab sequence.
     */
    @Test
    public void inferAvroSchemaFromHeaderDefinitionOfCSVTabDelimitedFile() throws Exception {

        // The property value is the two characters backslash + 't', not a literal tab character.
        runner.setProperty(InferAvroSchema.DELIMITER, "\\t");
        runner.assertValid();

        final Map<String, String> attributes = new HashMap<>();
        attributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv");
        runner.enqueue(new File("src/test/resources/Shapes_Header_TabDelimited.csv").toPath(), attributes);

        runner.run();
        runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0);
        runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 0);
        runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 1);
        runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 1);

        final MockFlowFile flowFile = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0);
        flowFile.assertContentEquals(
                unix2PlatformSpecificLineEndings(new File("src/test/resources/Shapes_header.csv.avro")));
        flowFile.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/avro-binary");
    }

    /**
     * Runs the tab-delimited file with a mismatching delimiter and expects the untouched original to be
     * routed to failure.
     */
    @Test
    public void inferAvroSchemaFromHeaderDefinitionOfCSVTabDelimitedFileNegativeTest() throws Exception {

        // Improper InferAvroSchema.DELIMITER > original goes to InferAvroSchema.REL_FAILURE
        runner.setProperty(InferAvroSchema.DELIMITER, ";");
        runner.assertValid();

        final Map<String, String> attributes = new HashMap<>();
        attributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv");
        runner.enqueue(new File("src/test/resources/Shapes_Header_TabDelimited.csv").toPath(), attributes);

        runner.run();
        runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0);
        runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 1);
        runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 0);
        runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 0);

        final MockFlowFile flowFile = runner.getFlowFilesForRelationship(InferAvroSchema.REL_FAILURE).get(0);
        flowFile.assertContentEquals(new File("src/test/resources/Shapes_Header_TabDelimited.csv").toPath());
        flowFile.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "text/csv");
    }

    /**
     * Supplies the CSV delimiter, escape, quote and charset through Expression Language that is resolved
     * from flow file attributes.
     */
    @Test
    public void specifyCSVparametersInExpressionLanguage() throws Exception {
        runner.setProperty(InferAvroSchema.DELIMITER, "${csv.delimiter}");
        runner.setProperty(InferAvroSchema.ESCAPE_STRING, "${csv.escape}");
        runner.setProperty(InferAvroSchema.QUOTE_STRING, "${csv.quote}");
        runner.setProperty(InferAvroSchema.CHARSET, "${csv.charset}");
        runner.setProperty(InferAvroSchema.GET_CSV_HEADER_DEFINITION_FROM_INPUT, "true");

        runner.assertValid();

        // Plain HashMap, as in the other tests; avoids an anonymous subclass via double-brace initialisation.
        final Map<String, String> attributes = new HashMap<>();
        attributes.put("csv.delimiter", ",");
        attributes.put("csv.escape", "\\");
        attributes.put("csv.quote", "\"");
        attributes.put("csv.charset", "UTF-8");
        attributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv");

        runner.enqueue(new File("src/test/resources/Shapes_Header.csv").toPath(), attributes);

        runner.run();
        runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0);
        runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 0);
        runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 1);
        runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 1);

        final MockFlowFile flowFile = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0);
        flowFile.assertContentEquals(
                unix2PlatformSpecificLineEndings(new File("src/test/resources/Shapes_header.csv.avro")));
        flowFile.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/avro-binary");
    }

    /**
     * Supplies the record name and the number of records to analyze through Expression Language that is
     * resolved from flow file attributes, for JSON input.
     */
    @Test
    public void specifyJsonParametersInExpressionLanguage() throws Exception {
        runner.assertValid();
        runner.setProperty(InferAvroSchema.INPUT_CONTENT_TYPE, InferAvroSchema.USE_MIME_TYPE);

        // Purposely set to true to test that none of the JSON file is read as a CSV header, which would cause issues.
        runner.setProperty(InferAvroSchema.GET_CSV_HEADER_DEFINITION_FROM_INPUT, "true");
        runner.setProperty(InferAvroSchema.SCHEMA_DESTINATION, InferAvroSchema.DESTINATION_ATTRIBUTE);
        runner.setProperty(InferAvroSchema.RECORD_NAME, "${record.name}");
        runner.setProperty(InferAvroSchema.NUM_RECORDS_TO_ANALYZE, "${records.analyze}");

        final Map<String, String> attributes = new HashMap<>();
        attributes.put(CoreAttributes.MIME_TYPE.key(), "application/json");
        attributes.put("record.name", "myrecord");
        attributes.put("records.analyze", "2");
        runner.enqueue(new File("src/test/resources/Shapes.json").toPath(), attributes);

        runner.run();
        runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0);
        runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 0);
        runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 1);
        runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 1);

        final MockFlowFile data = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0);
        final String avroSchema = data.getAttribute(InferAvroSchema.AVRO_SCHEMA_ATTRIBUTE_NAME);
        // Fail with a clear message instead of a NullPointerException when the attribute is missing.
        Assert.assertNotNull("schema attribute was not set on the flow file", avroSchema);
        Assert.assertTrue("schema does not carry the record name taken from the attribute: " + avroSchema,
                avroSchema.contains("\"name\" : \"myrecord\""));

        // Since the Avro schema is written to an attribute, the MIME type should be the same as the original
        data.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/json");
    }

    /**
     * Reads {@code file} and returns its bytes with every {@code '\n'} byte replaced by
     * {@link System#lineSeparator()} encoded as UTF-8. All other bytes, including any existing
     * {@code '\r'}, are copied through unchanged.
     *
     * @param file the file to read
     * @return the converted file content
     * @throws IOException if the file cannot be opened or read
     */
    static byte[] unix2PlatformSpecificLineEndings(final File file) throws IOException {
        try (final BufferedInputStream in = new BufferedInputStream(new FileInputStream(file));
                final ByteArrayOutputStream out = new ByteArrayOutputStream()) {
            final byte[] eol = System.lineSeparator().getBytes(StandardCharsets.UTF_8);
            int justRead;
            while ((justRead = in.read()) != -1) {
                if (justRead == '\n') {
                    out.write(eol);
                } else {
                    out.write(justRead);
                }
            }
            return out.toByteArray();
        }
    }

}