org.apache.nifi.repository.schema.TestSchemaRecordReaderWriter.java Source code

Introduction

Here is the source code for org.apache.nifi.repository.schema.TestSchemaRecordReaderWriter.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nifi.repository.schema;

import static org.apache.nifi.repository.schema.SchemaRecordWriter.MAX_ALLOWED_UTF_LENGTH;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.junit.Test;

public class TestSchemaRecordReaderWriter {

    private static Character utfCharOneByte = '$';
    private static Character utfCharTwoByte = '';
    private static Character utfCharThreeByte = '';
    private static String utfStringOneByte = utfCharOneByte.toString();
    private static String utfStringTwoByte = utfCharTwoByte.toString();
    private static String utfStringThreeByte = utfCharThreeByte.toString();

    @Test
    @SuppressWarnings("unchecked")
    public void testRoundTrip() throws IOException {
        // Create a 'complex' record that contains two different types of fields - a string and an int.
        final List<RecordField> complexFieldList1 = new ArrayList<>();
        complexFieldList1.add(createField("string field", FieldType.STRING));
        complexFieldList1.add(createField("int field", FieldType.INT));
        final ComplexRecordField complexField1 = new ComplexRecordField("complex1", Repetition.EXACTLY_ONE, complexFieldList1);
        final Map<RecordField, Object> complexMap1 = new LinkedHashMap<>();
        final RecordField stringField = createField("string field", FieldType.STRING);
        final RecordField intField = createField("int field", FieldType.INT);
        complexMap1.put(stringField, "apples");
        complexMap1.put(intField, 100);
        final FieldMapRecord complexRecord1 = new FieldMapRecord(complexMap1, new RecordSchema(stringField, intField));

        // Create another 'complex' record that contains two other types of fields - a long string and a long.
        final List<RecordField> complexFieldList2 = new ArrayList<>();
        complexFieldList2.add(createField("long string field", FieldType.LONG_STRING));
        complexFieldList2.add(createField("long field", FieldType.LONG));
        final ComplexRecordField complexField2 = new ComplexRecordField("complex2", Repetition.EXACTLY_ONE, complexFieldList2);
        final Map<RecordField, Object> complexMap2 = new LinkedHashMap<>();
        final RecordField longStringField = createField("long string field", FieldType.LONG_STRING);
        final RecordField longField = createField("long field", FieldType.LONG);
        complexMap2.put(longStringField, "oranges");
        complexMap2.put(longField, Long.MAX_VALUE);
        final FieldMapRecord complexRecord2 = new FieldMapRecord(complexMap2, new RecordSchema(longStringField, longField));

        // Create a Union Field that indicates that the type could be either 'complex 1' or 'complex 2'
        final UnionRecordField unionRecordField = new UnionRecordField("union", Repetition.ZERO_OR_MORE, Arrays.asList(new RecordField[] {complexField1, complexField2}));

        // Create a Record Schema
        final List<RecordField> fields = new ArrayList<>();
        fields.add(new SimpleRecordField("int", FieldType.INT, Repetition.ZERO_OR_ONE));
        fields.add(new SimpleRecordField("int present", FieldType.INT, Repetition.ZERO_OR_ONE));
        fields.add(new SimpleRecordField("boolean", FieldType.BOOLEAN, Repetition.ZERO_OR_ONE));
        fields.add(new SimpleRecordField("boolean present", FieldType.BOOLEAN, Repetition.ZERO_OR_ONE));
        fields.add(new SimpleRecordField("byte array", FieldType.BYTE_ARRAY, Repetition.ZERO_OR_ONE));
        fields.add(new SimpleRecordField("byte array present", FieldType.BYTE_ARRAY, Repetition.ZERO_OR_ONE));
        fields.add(new SimpleRecordField("long", FieldType.LONG, Repetition.ZERO_OR_ONE));
        fields.add(new SimpleRecordField("long present", FieldType.LONG, Repetition.ZERO_OR_ONE));
        fields.add(new SimpleRecordField("string", FieldType.STRING, Repetition.ZERO_OR_ONE));
        fields.add(new SimpleRecordField("string present", FieldType.STRING, Repetition.ZERO_OR_ONE));
        fields.add(new SimpleRecordField("long string", FieldType.LONG_STRING, Repetition.ZERO_OR_ONE));
        fields.add(new SimpleRecordField("long string present", FieldType.LONG_STRING, Repetition.ZERO_OR_ONE));
        fields.add(new ComplexRecordField("complex present", Repetition.EXACTLY_ONE,
            new SimpleRecordField("color", FieldType.STRING, Repetition.ZERO_OR_ONE),
            new SimpleRecordField("fruit", FieldType.STRING, Repetition.ZERO_OR_ONE)));
        fields.add(new MapRecordField("map present",
            new SimpleRecordField("key", FieldType.STRING, Repetition.EXACTLY_ONE),
            new SimpleRecordField("value", FieldType.INT, Repetition.EXACTLY_ONE), Repetition.ZERO_OR_ONE));
        fields.add(unionRecordField);

        final RecordSchema schema = new RecordSchema(fields);

        // Create a 'complex' record that contains two different elements.
        final RecordField colorField = createField("color", FieldType.STRING);
        final RecordField fruitField = createField("fruit", FieldType.STRING);
        final Map<RecordField, Object> complexFieldMap = new LinkedHashMap<>();
        complexFieldMap.put(colorField, "red");
        complexFieldMap.put(fruitField, "apple");

        // Create a simple map that can be used for a Map Field
        final Map<String, Integer> simpleMap = new HashMap<>();
        simpleMap.put("apples", 100);

        // Create a Map of record fields to values, so that we can create a Record to write out
        final Map<RecordField, Object> values = new LinkedHashMap<>();
        values.put(createField("int", FieldType.INT), 42);
        values.put(createField("int present", FieldType.INT), 42);
        values.put(createField("boolean present", FieldType.BOOLEAN), true);
        values.put(createField("byte array present", FieldType.BYTE_ARRAY), "Hello".getBytes());
        values.put(createField("long present", FieldType.LONG), 42L);
        values.put(createField("string present", FieldType.STRING), "Hello");
        values.put(createField("long string present", FieldType.LONG_STRING), "Long Hello");
        values.put(createField("complex present", FieldType.COMPLEX), new FieldMapRecord(complexFieldMap, new RecordSchema(colorField, fruitField)));
        values.put(new MapRecordField("map present", createField("key", FieldType.STRING), createField("value", FieldType.INT), Repetition.EXACTLY_ONE), simpleMap);
        values.put(unionRecordField, Arrays.asList(new NamedValue[] {
            new NamedValue("complex1", complexRecord1),
            new NamedValue("complex2", complexRecord2)}));

        final FieldMapRecord originalRecord = new FieldMapRecord(values, schema);

        // Write out a record and read it back in.
        try (final ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
            // Write the schema to the stream
            schema.writeTo(baos);

            // Write the record twice, to make sure that we're able to read/write multiple sequential records
            final SchemaRecordWriter writer = new SchemaRecordWriter();
            writer.writeRecord(originalRecord, baos);
            writer.writeRecord(originalRecord, baos);

            try (final InputStream in = new ByteArrayInputStream(baos.toByteArray())) {
                // Read the Schema from the stream and create a Record Reader for reading records, based on this schema
                final RecordSchema readSchema = RecordSchema.readFrom(in);
                final SchemaRecordReader reader = SchemaRecordReader.fromSchema(readSchema);

                // Read two records and verify the values.
                for (int i=0; i < 2; i++) {
                    final Record record = reader.readRecord(in);

                    assertNotNull(record);
                    assertEquals(42, record.getFieldValue("int"));
                    assertEquals(42, record.getFieldValue("int present"));
                    assertEquals(true, record.getFieldValue("boolean present"));
                    assertTrue(Arrays.equals("Hello".getBytes(), (byte[]) record.getFieldValue("byte array present")));
                    assertEquals(42L, record.getFieldValue("long present"));
                    assertEquals("Hello", record.getFieldValue("string present"));
                    assertEquals("Long Hello", record.getFieldValue("long string present"));

                    final Record complexRecord = (Record) record.getFieldValue("complex present");
                    assertEquals("red", complexRecord.getFieldValue("color"));
                    assertEquals("apple", complexRecord.getFieldValue("fruit"));

                    assertEquals(simpleMap, record.getFieldValue("map present"));

                    final List<Record> unionRecords = (List<Record>) record.getFieldValue("union");
                    assertNotNull(unionRecords);
                    assertEquals(2, unionRecords.size());

                    final Record unionRecord1 = unionRecords.get(0);
                    assertEquals("apples", unionRecord1.getFieldValue("string field"));
                    assertEquals(100, unionRecord1.getFieldValue("int field"));

                    final Record unionRecord2 = unionRecords.get(1);
                    assertEquals("oranges", unionRecord2.getFieldValue("long string field"));
                    assertEquals(Long.MAX_VALUE, unionRecord2.getFieldValue("long field"));
                }

                // Ensure that there is no more data.
                assertNull(reader.readRecord(in));
            }
        }
    }

    @Test
    @SuppressWarnings("unchecked")
    public void testUTFLargerThan64k() throws IOException {
        // Create a Record Schema
        final List<RecordField> fields = new ArrayList<>();
        fields.add(new SimpleRecordField("int present", FieldType.INT, Repetition.ZERO_OR_ONE));
        fields.add(new SimpleRecordField("string present", FieldType.STRING, Repetition.ZERO_OR_ONE));

        final RecordSchema schema = new RecordSchema(fields);

        // Create a Map of record fields to values, so that we can create a Record to write out
        final Map<RecordField, Object> values = new LinkedHashMap<>();
        values.put(createField("int present", FieldType.INT), 42);
        final String utfString = utfStringOneByte + utfStringTwoByte + utfStringThreeByte;  // 3 chars and 6 utf8 bytes
        final String seventyK = StringUtils.repeat(utfString, 21845);  // 65,535 chars and 131070 utf8 bytes
        assertTrue(seventyK.length() == 65535);
        assertTrue(seventyK.getBytes("UTF-8").length == 131070);
        values.put(createField("string present", FieldType.STRING), seventyK);

        final FieldMapRecord originalRecord = new FieldMapRecord(values, schema);

        // Write out a record and read it back in.
        try (final ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
            // Write the schema to the stream
            schema.writeTo(baos);

            // Write the record twice, to make sure that we're able to read/write multiple sequential records
            final SchemaRecordWriter writer = new SchemaRecordWriter();
            writer.writeRecord(originalRecord, baos);
            writer.writeRecord(originalRecord, baos);

            try (final InputStream in = new ByteArrayInputStream(baos.toByteArray())) {
                // Read the Schema from the stream and create a Record Reader for reading records, based on this schema
                final RecordSchema readSchema = RecordSchema.readFrom(in);
                final SchemaRecordReader reader = SchemaRecordReader.fromSchema(readSchema);

                // Read the records and verify the values.
                for (int i=0; i < 2; i++) {
                    final Record record = reader.readRecord(in);

                    assertNotNull(record);
                    assertEquals(42, record.getFieldValue("int present"));
                    assertTrue(MAX_ALLOWED_UTF_LENGTH - ((String)record.getFieldValue("string present")).getBytes("utf-8").length <= 3);
                    assertEquals(32768, ((String)record.getFieldValue("string present")).length());
                }

                // Ensure that there is no more data.
                assertNull(reader.readRecord(in));
            }
        }
    }

    @Test
    public void testSingleCharUTF8Lengths() {
        // verify handling of single characters mapping to utf8 byte strings
        assertEquals("test 1 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringOneByte, 0));
        assertEquals("test 2 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringTwoByte, 0));
        assertEquals("test 3 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringThreeByte, 0));
        assertEquals("test 1 char string truncated to 1 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringOneByte, 1));
        assertEquals("test 2 char string truncated to 1 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringTwoByte, 1));
        assertEquals("test 3 char string truncated to 1 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringThreeByte, 1));
        assertEquals("test 1 char string truncated to 2 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringOneByte, 2));
        assertEquals("test 2 char string truncated to 2 utf bytes should be 2", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringTwoByte, 2));
        assertEquals("test 3 char string truncated to 2 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringThreeByte, 2));
        assertEquals("test 1 char string truncated to 3 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringOneByte, 3));
        assertEquals("test 2 char string truncated to 3 utf bytes should be 2", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringTwoByte, 3));
        assertEquals("test 3 char string truncated to 3 utf bytes should be 3", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringThreeByte, 3));
    }

    @Test
    public void testMultiCharUTFLengths() {
        // test boundary conditions as 1, 2, and 3 UTF byte chars are included into utf limit                                                  positions used by strings
        final String testString1 = utfStringOneByte + utfStringTwoByte + utfStringThreeByte;                                                // char 'abc' utf 'abbccc'
        assertEquals("test 6 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 0)); //            utf ''
        assertEquals("test 6 char string truncated to 1 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 1)); //            utf 'a'
        assertEquals("test 6 char string truncated to 2 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 2)); //            utf 'a'
        assertEquals("test 6 char string truncated to 3 utf bytes should be 2", 2, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 3)); //            utf 'abb'
        assertEquals("test 6 char string truncated to 4 utf bytes should be 2", 2, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 4)); //            utf 'abb'
        assertEquals("test 6 char string truncated to 5 utf bytes should be 2", 2, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 5)); //            utf 'abb'
        assertEquals("test 6 char string truncated to 6 utf bytes should be 3", 3, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 6)); //            utf 'abbccc'
    }

    @Test
    public void testSmallCharUTFLengths() throws UnsupportedEncodingException {
        final String string12b = StringUtils.repeat(utfStringOneByte + utfStringTwoByte + utfStringThreeByte, 2);

        assertEquals("test multi-char string truncated to  0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(string12b,  0));
        assertEquals("test multi-char string truncated to  1 utf bytes should be 0", 1, SchemaRecordWriter.getCharsInUTF8Limit(string12b,  1));
        assertEquals("test multi-char string truncated to  2 utf bytes should be 0", 1, SchemaRecordWriter.getCharsInUTF8Limit(string12b,  2));
        assertEquals("test multi-char string truncated to  3 utf bytes should be 0", 2, SchemaRecordWriter.getCharsInUTF8Limit(string12b,  3));
        assertEquals("test multi-char string truncated to  4 utf bytes should be 0", 2, SchemaRecordWriter.getCharsInUTF8Limit(string12b,  4));
        assertEquals("test multi-char string truncated to  5 utf bytes should be 0", 2, SchemaRecordWriter.getCharsInUTF8Limit(string12b,  5));
        assertEquals("test multi-char string truncated to  6 utf bytes should be 0", 3, SchemaRecordWriter.getCharsInUTF8Limit(string12b,  6));
        assertEquals("test multi-char string truncated to  7 utf bytes should be 0", 4, SchemaRecordWriter.getCharsInUTF8Limit(string12b,  7));
        assertEquals("test multi-char string truncated to  8 utf bytes should be 0", 4, SchemaRecordWriter.getCharsInUTF8Limit(string12b,  8));
        assertEquals("test multi-char string truncated to  9 utf bytes should be 0", 5, SchemaRecordWriter.getCharsInUTF8Limit(string12b,  9));
        assertEquals("test multi-char string truncated to 10 utf bytes should be 0", 5, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 10));
        assertEquals("test multi-char string truncated to 11 utf bytes should be 0", 5, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 11));
        assertEquals("test multi-char string truncated to 12 utf bytes should be 0", 6, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 12));
    }

    @Test
    public void testLargeCharUTFLengths() {
        final String string64k = StringUtils.repeat(utfStringOneByte + utfStringTwoByte + utfStringThreeByte, 21845);

        assertEquals("test 64k char string should be 64k chars long", 65535, string64k.length());

        // drop half the chars going to utf of 64k bytes -- (1+1+1) * 21845 = 65535 chars which converts to (1+2+3) * 21845 = 131070 utf bytes so 1/2 is truncated
        assertEquals("test 64k char string truncated to 65,535 utf bytes should be 32768", 32768, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65535));

        // dropping bytes off the end of utf length
        assertEquals("test 64k char string truncated to 65,534 utf bytes should be 32767", 32767, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65534)); // lost 2 byte char
        assertEquals("test 64k char string truncated to 65,533 utf bytes should be 32767", 32767, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65533));
        assertEquals("test 64k char string truncated to 65,532 utf bytes should be 32766", 32766, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65532)); // lost 1 byte char
        assertEquals("test 64k char string truncated to 65,531 utf bytes should be 32765", 32765, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65531)); // lost 3 byte char
        assertEquals("test 64k char string truncated to 65,530 utf bytes should be 32765", 32765, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65530));
        assertEquals("test 64k char string truncated to 65,529 utf bytes should be 32765", 32765, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65529));
        assertEquals("test 64k char string truncated to 65,528 utf bytes should be 32764", 32764, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65528)); // lost 2 byte char (again)
    }

    private SimpleRecordField createField(final String fieldName, final FieldType type) {
        return new SimpleRecordField(fieldName, type, Repetition.ZERO_OR_ONE);
    }
}