org.apache.nifi.processors.orc.PutORCTest.java Source code

Introduction

Here is the source code for org.apache.nifi.processors.orc.PutORCTest.java, the unit test for Apache NiFi's PutORC processor.
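
The tests cover the happy path (writing user records with default settings, then verifying flow file attributes, the generated Hive DDL attribute, the SEND provenance event, and the ORC file contents), Avro logical types, schema resolution through Expression Language, nested records, and error routing: a MalformedRecordException from the record reader routes to failure, while IOExceptions thrown while creating the writer, reading records, or renaming the temp dot file route to retry.

Every test follows the same nifi-mock TestRunner pattern. Here is a minimal sketch, assuming nifi-mock is on the classpath; it reuses the imports of the class below plus org.apache.nifi.serialization.record.RecordFieldType, and its property values mirror the ones used in configure():

    @Test
    public void sketchOfRunnerPattern() throws Exception {
        // Build a runner around the processor under test.
        final TestRunner runner = TestRunners.newTestRunner(new PutORC());
        runner.setProperty(PutORC.HADOOP_CONFIGURATION_RESOURCES, "src/test/resources/core-site.xml");
        runner.setProperty(PutORC.DIRECTORY, "target");

        // Records come from a RecordReaderFactory controller service;
        // the flow file content ("trigger") is never parsed as record data.
        final MockRecordParser readerFactory = new MockRecordParser();
        readerFactory.addSchemaField("name", RecordFieldType.STRING, true);
        readerFactory.addRecord("name0");
        runner.addControllerService("mock-reader-factory", readerFactory);
        runner.enableControllerService(readerFactory);
        runner.setProperty(PutORC.RECORD_READER, "mock-reader-factory");

        runner.enqueue("trigger");
        runner.run();
        runner.assertAllFlowFilesTransferred(PutORC.REL_SUCCESS, 1);
    }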

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nifi.processors.orc;

import org.apache.avro.Schema;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.RecordReader;
import org.apache.hadoop.hive.serde2.io.DateWritableV2;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritableV2;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.log4j.BasicConfigurator;
import org.apache.nifi.avro.AvroTypeUtil;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processors.hadoop.exception.FailureException;
import org.apache.nifi.processors.hadoop.record.HDFSRecordWriter;
import org.apache.nifi.provenance.ProvenanceEventRecord;
import org.apache.nifi.provenance.ProvenanceEventType;
import org.apache.nifi.reporting.InitializationException;
import org.apache.nifi.schema.access.SchemaNotFoundException;
import org.apache.nifi.serialization.MalformedRecordException;
import org.apache.nifi.serialization.RecordReaderFactory;
import org.apache.nifi.serialization.record.MapRecord;
import org.apache.nifi.serialization.record.MockRecordParser;
import org.apache.nifi.serialization.record.RecordField;
import org.apache.nifi.serialization.record.RecordSchema;
import org.apache.nifi.serialization.record.RecordSet;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.mockito.Mockito;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.sql.Date;
import java.sql.Timestamp;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.LocalTime;
import java.time.temporal.ChronoField;
import java.util.Calendar;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TimeZone;
import java.util.function.BiFunction;

import static org.junit.Assert.assertEquals;
import static org.mockito.Matchers.any;
import static org.mockito.Mockito.when;

public class PutORCTest {

    private static final String DIRECTORY = "target";
    private static final String TEST_CONF_PATH = "src/test/resources/core-site.xml";

    private Schema schema;
    private Configuration testConf;
    private PutORC proc;
    private TestRunner testRunner;

    @BeforeClass
    public static void setupLogging() {
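        // Configure log4j with a basic console appender so processor logging is visible during tests.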
        BasicConfigurator.configure();
    }

    @Before
    public void setup() throws IOException {
        final String avroSchema = IOUtils.toString(new FileInputStream("src/test/resources/user.avsc"),
                StandardCharsets.UTF_8);
        schema = new Schema.Parser().parse(avroSchema);

        testConf = new Configuration();
        testConf.addResource(new Path(TEST_CONF_PATH));

        proc = new PutORC();
    }

    private void configure(final PutORC putORC, final int numUsers) throws InitializationException {
        configure(putORC, numUsers, null);
    }

    /**
     * Builds a TestRunner for the given processor backed by a MockRecordParser reader service.
     * Emits numUsers default user records unless a custom recordGenerator is supplied.
     */
    private void configure(final PutORC putORC, final int numUsers,
            final BiFunction<Integer, MockRecordParser, Void> recordGenerator) throws InitializationException {
        testRunner = TestRunners.newTestRunner(putORC);
        testRunner.setProperty(PutORC.HADOOP_CONFIGURATION_RESOURCES, TEST_CONF_PATH);
        testRunner.setProperty(PutORC.DIRECTORY, DIRECTORY);

        MockRecordParser readerFactory = new MockRecordParser();

        final RecordSchema recordSchema = AvroTypeUtil.createSchema(schema);
        for (final RecordField recordField : recordSchema.getFields()) {
            readerFactory.addSchemaField(recordField.getFieldName(), recordField.getDataType().getFieldType(),
                    recordField.isNullable());
        }

        if (recordGenerator == null) {
            for (int i = 0; i < numUsers; i++) {
                readerFactory.addRecord("name" + i, i, "blue" + i, i * 10.0);
            }
        } else {
            recordGenerator.apply(numUsers, readerFactory);
        }

        testRunner.addControllerService("mock-reader-factory", readerFactory);
        testRunner.enableControllerService(readerFactory);

        testRunner.setProperty(PutORC.RECORD_READER, "mock-reader-factory");
    }

    @Test
    public void testWriteORCWithDefaults() throws IOException, InitializationException {
        configure(proc, 100);

        final String filename = "testORCWithDefaults-" + System.currentTimeMillis();

        final Map<String, String> flowFileAttributes = new HashMap<>();
        flowFileAttributes.put(CoreAttributes.FILENAME.key(), filename);

        testRunner.setProperty(PutORC.HIVE_TABLE_NAME, "myTable");

        testRunner.enqueue("trigger", flowFileAttributes);
        testRunner.run();
        testRunner.assertAllFlowFilesTransferred(PutORC.REL_SUCCESS, 1);

        final Path orcFile = new Path(DIRECTORY + "/" + filename);

        // verify the successful flow file has the expected attributes
        final MockFlowFile mockFlowFile = testRunner.getFlowFilesForRelationship(PutORC.REL_SUCCESS).get(0);
        mockFlowFile.assertAttributeEquals(PutORC.ABSOLUTE_HDFS_PATH_ATTRIBUTE, orcFile.getParent().toString());
        mockFlowFile.assertAttributeEquals(CoreAttributes.FILENAME.key(), filename);
        mockFlowFile.assertAttributeEquals(PutORC.RECORD_COUNT_ATTR, "100");
        mockFlowFile.assertAttributeEquals(PutORC.HIVE_DDL_ATTRIBUTE,
                "CREATE EXTERNAL TABLE IF NOT EXISTS `myTable` (`name` STRING, `favorite_number` INT, `favorite_color` STRING, `scale` DOUBLE) STORED AS ORC");

        // verify we generated a provenance event
        final List<ProvenanceEventRecord> provEvents = testRunner.getProvenanceEvents();
        assertEquals(1, provEvents.size());

        // verify it was a SEND event with the correct URI
        final ProvenanceEventRecord provEvent = provEvents.get(0);
        assertEquals(ProvenanceEventType.SEND, provEvent.getEventType());
        // If it runs with a real HDFS, the protocol will be "hdfs://", but with a local filesystem, just assert the filename.
        Assert.assertTrue(provEvent.getTransitUri().endsWith(DIRECTORY + "/" + filename));

        // verify the content of the ORC file by reading it back in
        verifyORCUsers(orcFile, 100);

        // verify we don't have the temp dot file after success
        final File tempOrcFile = new File(DIRECTORY + "/." + filename);
        Assert.assertFalse(tempOrcFile.exists());

        // verify we DO have the CRC file after success
        final File crcAvroORCFile = new File(DIRECTORY + "/." + filename + ".crc");
        Assert.assertTrue(crcAvroORCFile.exists());
    }

    @Test
    public void testWriteORCWithAvroLogicalTypes() throws IOException, InitializationException {
        final String avroSchema = IOUtils.toString(
                new FileInputStream("src/test/resources/user_logical_types.avsc"), StandardCharsets.UTF_8);
        schema = new Schema.Parser().parse(avroSchema);
        Calendar now = Calendar.getInstance();
        LocalTime nowTime = LocalTime.now();
        LocalDateTime nowDateTime = LocalDateTime.now();
        LocalDate epoch = LocalDate.ofEpochDay(0);
        LocalDate nowDate = LocalDate.now();

        final int timeMillis = nowTime.get(ChronoField.MILLI_OF_DAY);
        final Timestamp timestampMillis = Timestamp.valueOf(nowDateTime);
        final Date dt = Date.valueOf(nowDate);
        final double dec = 1234.56;

        configure(proc, 10, (numUsers, readerFactory) -> {
            for (int i = 0; i < numUsers; i++) {
                readerFactory.addRecord(i, timeMillis, timestampMillis, dt, dec);
            }
            return null;
        });

        final String filename = "testORCWithDefaults-" + System.currentTimeMillis();

        final Map<String, String> flowFileAttributes = new HashMap<>();
        flowFileAttributes.put(CoreAttributes.FILENAME.key(), filename);

        testRunner.setProperty(PutORC.HIVE_TABLE_NAME, "myTable");

        testRunner.enqueue("trigger", flowFileAttributes);
        testRunner.run();
        testRunner.assertAllFlowFilesTransferred(PutORC.REL_SUCCESS, 1);

        final Path orcFile = new Path(DIRECTORY + "/" + filename);

        // verify the successful flow file has the expected attributes
        final MockFlowFile mockFlowFile = testRunner.getFlowFilesForRelationship(PutORC.REL_SUCCESS).get(0);
        mockFlowFile.assertAttributeEquals(PutORC.ABSOLUTE_HDFS_PATH_ATTRIBUTE, orcFile.getParent().toString());
        mockFlowFile.assertAttributeEquals(CoreAttributes.FILENAME.key(), filename);
        mockFlowFile.assertAttributeEquals(PutORC.RECORD_COUNT_ATTR, "10");
        // DDL will be created with field names normalized (lowercased, e.g.) for Hive by default
        mockFlowFile.assertAttributeEquals(PutORC.HIVE_DDL_ATTRIBUTE,
                "CREATE EXTERNAL TABLE IF NOT EXISTS `myTable` (`id` INT, `timemillis` INT, `timestampmillis` TIMESTAMP, `dt` DATE, `dec` DOUBLE) STORED AS ORC");

        // verify we generated a provenance event
        final List<ProvenanceEventRecord> provEvents = testRunner.getProvenanceEvents();
        assertEquals(1, provEvents.size());

        // verify it was a SEND event with the correct URI
        final ProvenanceEventRecord provEvent = provEvents.get(0);
        assertEquals(ProvenanceEventType.SEND, provEvent.getEventType());
        // If it runs with a real HDFS, the protocol will be "hdfs://", but with a local filesystem, just assert the filename.
        Assert.assertTrue(provEvent.getTransitUri().endsWith(DIRECTORY + "/" + filename));

        // verify the content of the ORC file by reading it back in
        verifyORCUsers(orcFile, 10, (x, currUser) -> {
            assertEquals((int) currUser, ((IntWritable) x.get(0)).get());
            assertEquals(timeMillis, ((IntWritable) x.get(1)).get());
            assertEquals(timestampMillis, ((TimestampWritableV2) x.get(2)).getTimestamp().toSqlTimestamp());
            final DateFormat noTimeOfDayDateFormat = new SimpleDateFormat("yyyy-MM-dd");
            noTimeOfDayDateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
            assertEquals(noTimeOfDayDateFormat.format(dt), ((DateWritableV2) x.get(3)).get().toString());
            assertEquals(dec, ((DoubleWritable) x.get(4)).get(), Double.MIN_VALUE);
            return null;
        });

        // verify we don't have the temp dot file after success
        final File tempOrcFile = new File(DIRECTORY + "/." + filename);
        Assert.assertFalse(tempOrcFile.exists());

        // verify we DO have the CRC file after success
        final File crcAvroORCFile = new File(DIRECTORY + "/." + filename + ".crc");
        Assert.assertTrue(crcAvroORCFile.exists());
    }

    @Test
    public void testValidSchemaWithELShouldBeSuccessful() throws InitializationException {
        configure(proc, 10);

        final String filename = "testValidSchemaWithELShouldBeSuccessful-" + System.currentTimeMillis();

        // provide the schema in the my.schema attribute so it can be resolved via Expression Language
        final Map<String, String> flowFileAttributes = new HashMap<>();
        flowFileAttributes.put(CoreAttributes.FILENAME.key(), filename);
        flowFileAttributes.put("my.schema", schema.toString());

        testRunner.enqueue("trigger", flowFileAttributes);
        testRunner.run();
        testRunner.assertAllFlowFilesTransferred(PutORC.REL_SUCCESS, 1);
    }

    @Test
    public void testMalformedRecordExceptionFromReaderShouldRouteToFailure()
            throws InitializationException, IOException, MalformedRecordException, SchemaNotFoundException {
        configure(proc, 10);

        final org.apache.nifi.serialization.RecordReader recordReader = Mockito
                .mock(org.apache.nifi.serialization.RecordReader.class);
        when(recordReader.nextRecord()).thenThrow(new MalformedRecordException("ERROR"));

        final RecordReaderFactory readerFactory = Mockito.mock(RecordReaderFactory.class);
        when(readerFactory.getIdentifier()).thenReturn("mock-reader-factory");
        when(readerFactory.createRecordReader(any(FlowFile.class), any(InputStream.class), any(ComponentLog.class)))
                .thenReturn(recordReader);

        testRunner.addControllerService("mock-reader-factory", readerFactory);
        testRunner.enableControllerService(readerFactory);
        testRunner.setProperty(PutORC.RECORD_READER, "mock-reader-factory");

        final String filename = "testMalformedRecordExceptionShouldRouteToFailure-" + System.currentTimeMillis();

        final Map<String, String> flowFileAttributes = new HashMap<>();
        flowFileAttributes.put(CoreAttributes.FILENAME.key(), filename);

        testRunner.enqueue("trigger", flowFileAttributes);
        testRunner.run();
        testRunner.assertAllFlowFilesTransferred(PutORC.REL_FAILURE, 1);
    }

    @Test
    public void testIOExceptionCreatingWriterShouldRouteToRetry() throws InitializationException {
        final PutORC proc = new PutORC() {
            @Override
            public HDFSRecordWriter createHDFSRecordWriter(ProcessContext context, FlowFile flowFile,
                    Configuration conf, Path path, RecordSchema schema) throws IOException {
                throw new IOException("IOException");
            }
        };
        configure(proc, 0);

        final String filename = "testMalformedRecordExceptionShouldRouteToFailure-" + System.currentTimeMillis();

        final Map<String, String> flowFileAttributes = new HashMap<>();
        flowFileAttributes.put(CoreAttributes.FILENAME.key(), filename);

        testRunner.enqueue("trigger", flowFileAttributes);
        testRunner.run();
        testRunner.assertAllFlowFilesTransferred(PutORC.REL_RETRY, 1);
    }

    @Test
    public void testIOExceptionFromReaderShouldRouteToRetry()
            throws InitializationException, IOException, MalformedRecordException, SchemaNotFoundException {
        configure(proc, 10);

        final RecordSet recordSet = Mockito.mock(RecordSet.class);
        when(recordSet.next()).thenThrow(new IOException("ERROR"));

        final org.apache.nifi.serialization.RecordReader recordReader = Mockito
                .mock(org.apache.nifi.serialization.RecordReader.class);
        when(recordReader.createRecordSet()).thenReturn(recordSet);
        when(recordReader.getSchema()).thenReturn(AvroTypeUtil.createSchema(schema));

        final RecordReaderFactory readerFactory = Mockito.mock(RecordReaderFactory.class);
        when(readerFactory.getIdentifier()).thenReturn("mock-reader-factory");
        when(readerFactory.createRecordReader(any(FlowFile.class), any(InputStream.class), any(ComponentLog.class)))
                .thenReturn(recordReader);

        testRunner.addControllerService("mock-reader-factory", readerFactory);
        testRunner.enableControllerService(readerFactory);
        testRunner.setProperty(PutORC.RECORD_READER, "mock-reader-factory");

        final String filename = "testMalformedRecordExceptionShouldRouteToFailure-" + System.currentTimeMillis();

        final Map<String, String> flowFileAttributes = new HashMap<>();
        flowFileAttributes.put(CoreAttributes.FILENAME.key(), filename);

        testRunner.enqueue("trigger", flowFileAttributes);
        testRunner.run();
        testRunner.assertAllFlowFilesTransferred(PutORC.REL_RETRY, 1);
    }

    @Test
    public void testIOExceptionRenamingShouldRouteToRetry() throws InitializationException {
        final PutORC proc = new PutORC() {
            @Override
            protected void rename(FileSystem fileSystem, Path srcFile, Path destFile)
                    throws IOException, InterruptedException, FailureException {
                throw new IOException("IOException renaming");
            }
        };

        configure(proc, 10);

        final String filename = "testIOExceptionRenamingShouldRouteToRetry-" + System.currentTimeMillis();

        final Map<String, String> flowFileAttributes = new HashMap<>();
        flowFileAttributes.put(CoreAttributes.FILENAME.key(), filename);

        testRunner.enqueue("trigger", flowFileAttributes);
        testRunner.run();
        testRunner.assertAllFlowFilesTransferred(PutORC.REL_RETRY, 1);

        // verify the temp dot file was cleaned up after the failed rename
        final File tempAvroORCFile = new File(DIRECTORY + "/." + filename);
        Assert.assertFalse(tempAvroORCFile.exists());
    }

    @Test
    public void testNestedRecords() throws Exception {
        testRunner = TestRunners.newTestRunner(proc);
        testRunner.setProperty(PutORC.HADOOP_CONFIGURATION_RESOURCES, TEST_CONF_PATH);
        testRunner.setProperty(PutORC.DIRECTORY, DIRECTORY);

        MockRecordParser readerFactory = new MockRecordParser();

        final String avroSchema = IOUtils.toString(new FileInputStream("src/test/resources/nested_record.avsc"),
                StandardCharsets.UTF_8);
        schema = new Schema.Parser().parse(avroSchema);

        final RecordSchema recordSchema = AvroTypeUtil.createSchema(schema);
        for (final RecordField recordField : recordSchema.getFields()) {
            readerFactory.addSchemaField(recordField);
        }

        Map<String, Object> nestedRecordMap = new HashMap<>();
        nestedRecordMap.put("id", 11088000000001615L);
        nestedRecordMap.put("x", "Hello World!");

        RecordSchema nestedRecordSchema = AvroTypeUtil.createSchema(schema.getField("myField").schema());
        MapRecord nestedRecord = new MapRecord(nestedRecordSchema, nestedRecordMap);
        // This gets added in to its spot in the schema, which is already named "myField"
        readerFactory.addRecord(nestedRecord);

        testRunner.addControllerService("mock-reader-factory", readerFactory);
        testRunner.enableControllerService(readerFactory);

        testRunner.setProperty(PutORC.RECORD_READER, "mock-reader-factory");

        testRunner.enqueue("trigger");
        testRunner.run();
        testRunner.assertAllFlowFilesTransferred(PutORC.REL_SUCCESS, 1);
    }

    private void verifyORCUsers(final Path orcUsers, final int numExpectedUsers) throws IOException {
        verifyORCUsers(orcUsers, numExpectedUsers, null);
    }

    private void verifyORCUsers(final Path orcUsers, final int numExpectedUsers,
            BiFunction<List<Object>, Integer, Void> assertFunction) throws IOException {
        Reader reader = OrcFile.createReader(orcUsers, OrcFile.readerOptions(testConf));
        RecordReader recordReader = reader.rows();

        // This struct layout matches user.avsc; tests that write a different
        // schema (e.g. the logical-types test) supply their own assertFunction.
        TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(
                "struct<name:string,favorite_number:int,favorite_color:string,scale:double>");
        StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(typeInfo);

        int currUser = 0;
        Object nextRecord = null;
        while ((nextRecord = recordReader.next(nextRecord)) != null) {
            Assert.assertNotNull(nextRecord);
            Assert.assertTrue("Not an OrcStruct", nextRecord instanceof OrcStruct);
            List<Object> x = inspector.getStructFieldsDataAsList(nextRecord);

            if (assertFunction == null) {
                assertEquals("name" + currUser, x.get(0).toString());
                assertEquals(currUser, ((IntWritable) x.get(1)).get());
                assertEquals("blue" + currUser, x.get(2).toString());
                assertEquals(10.0 * currUser, ((DoubleWritable) x.get(3)).get(), Double.MIN_VALUE);
            } else {
                assertFunction.apply(x, currUser);
            }
            currUser++;
        }

        assertEquals(numExpectedUsers, currUser);
    }

}
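
To run just this test with Maven, the standard Surefire test filter works (run it from the module that contains PutORC):

    mvn test -Dtest=PutORCTest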