com.uber.hoodie.common.table.log.avro.AvroLogAppenderTest.java Source code

Introduction

Here is the source code for com.uber.hoodie.common.table.log.avro.AvroLogAppenderTest.java
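
The tests below exercise the rolling Avro log appender and its readers against a mini HDFS cluster, since LocalFileSystem does not support append. As a quick orientation, here is a minimal sketch of the append-then-read cycle the tests repeat, distilled from the listing; fs, partitionPath, and records stand in for a live filesystem, a partition directory, and a batch of Avro IndexedRecords:

    HoodieLogAppendConfig config = HoodieLogAppendConfig.newBuilder()
            .onPartitionPath(partitionPath)
            .withLogFileExtension(HoodieLogFile.DELTA_EXTENSION)
            .withFileId("some-file-id")
            .withBaseCommitTime("100")
            .withSchema(SchemaTestUtil.getSimpleSchema())
            .withFs(fs)
            .build();

    // Remember where the new block will start, append a batch, then close.
    RollingAvroLogAppender appender = new RollingAvroLogAppender(config);
    long blockOffset = appender.getCurrentSize();
    appender.append(records.iterator());
    appender.close();

    // Read that block back from its starting offset.
    AvroLogReader reader = new AvroLogReader(config.getLogFile(), fs, config.getSchema());
    Iterator<GenericRecord> readBack = reader.readBlock(blockOffset);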

Source

/*
 *  Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *           http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package com.uber.hoodie.common.table.log.avro;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.uber.hoodie.common.minicluster.MiniClusterUtil;
import com.uber.hoodie.common.table.log.HoodieLogAppendConfig;
import com.uber.hoodie.common.table.log.HoodieLogFile;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.SchemaTestUtil;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.commons.collections.IteratorUtils;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

import java.io.IOException;
import java.net.URISyntaxException;
import java.util.Iterator;
import java.util.List;
import java.util.SortedMap;
import java.util.stream.Collectors;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

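/**
 * Tests for RollingAvroLogAppender, AvroLogReader and CompositeAvroLogReader,
 * run against a mini HDFS cluster because LocalFileSystem does not support append.
 */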
public class AvroLogAppenderTest {
    private FileSystem fs;
    private Path partitionPath;

    @BeforeClass
    public static void setUpClass() throws IOException, InterruptedException {
        // Append is not supported in LocalFileSystem. HDFS needs to be setup.
        MiniClusterUtil.setUp();
    }

    @AfterClass
    public static void tearDownClass() {
        MiniClusterUtil.shutdown();
    }

    @Before
    public void setUp() throws IOException, InterruptedException {
        this.fs = MiniClusterUtil.fileSystem;
        TemporaryFolder folder = new TemporaryFolder();
        folder.create();
        assertTrue(fs.mkdirs(new Path(folder.getRoot().getPath())));
        this.partitionPath = new Path(folder.getRoot().getPath());
    }

    @After
    public void tearDown() throws IOException {
        fs.delete(partitionPath, true);
    }

    @Test
    public void testBasicAppend() throws IOException, URISyntaxException, InterruptedException {
        HoodieLogAppendConfig logConfig = HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath)
                .withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
                .withBaseCommitTime("100").withSchema(SchemaTestUtil.getSimpleSchema()).withFs(fs).build();
        RollingAvroLogAppender logAppender = new RollingAvroLogAppender(logConfig);
        logAppender.append(SchemaTestUtil.generateTestRecords(0, 100).iterator());
        long size1 = logAppender.getCurrentSize();
        assertTrue("Appender should report a positive size after the first append", size1 > 0);
        assertEquals("Reported size should match the log file length on disk", size1,
                fs.getFileStatus(logConfig.getLogFile().getPath()).getLen());
        logAppender.close();

        // Close and Open again and append 100 more records
        logAppender = new RollingAvroLogAppender(logConfig);
        logAppender.append(SchemaTestUtil.generateTestRecords(100, 100).iterator());
        long size2 = logAppender.getCurrentSize();
        assertTrue("Size should grow after appending 100 more records", size2 > size1);
        assertEquals("Reported size should match the log file length on disk", size2,
                fs.getFileStatus(logConfig.getLogFile().getPath()).getLen());
        logAppender.close();

        // Close and Open again and append 100 more records
        logAppender = new RollingAvroLogAppender(logConfig);
        logAppender.append(SchemaTestUtil.generateTestRecords(200, 100).iterator());
        long size3 = logAppender.getCurrentSize();
        assertTrue("Size should keep growing with each appended batch", size3 > size2);
        assertEquals("Reported size should match the log file length on disk", size3,
                fs.getFileStatus(logConfig.getLogFile().getPath()).getLen());
        logAppender.close();
        // Cannot get the current size after closing the log
        try {
            logAppender.getCurrentSize();
            fail("getCurrentSize should fail after the logAppender is closed");
        } catch (IllegalStateException e) {
            // pass
        }
    }

    @Test
    public void testLeaseRecovery() throws IOException, URISyntaxException, InterruptedException {
        HoodieLogAppendConfig logConfig = HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath)
                .withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
                .withBaseCommitTime("100").withSchema(SchemaTestUtil.getSimpleSchema()).withFs(fs).build();
        RollingAvroLogAppender logAppender = new RollingAvroLogAppender(logConfig);
        logAppender.append(SchemaTestUtil.generateTestRecords(0, 100).iterator());
        // do not close this log appender
        // logAppender.close();
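        // Reopening the same log file below forces the new writer to recover the
        // HDFS lease still held by the unclosed appender before it can append again.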

        // Try opening again and append 100 more records
        logAppender = new RollingAvroLogAppender(logConfig);
        logAppender.append(SchemaTestUtil.generateTestRecords(100, 100).iterator());
        assertEquals("Reported size should match the log file length on disk",
                logAppender.getCurrentSize(), fs.getFileStatus(logConfig.getLogFile().getPath()).getLen());
        logAppender.close();
    }

    @Test
    public void testAppendOnCorruptedBlock() throws IOException, URISyntaxException, InterruptedException {
        HoodieLogAppendConfig logConfig = HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath)
                .withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
                .withBaseCommitTime("100").withSchema(SchemaTestUtil.getSimpleSchema()).withFs(fs).build();
        RollingAvroLogAppender logAppender = new RollingAvroLogAppender(logConfig);
        logAppender.append(SchemaTestUtil.generateTestRecords(0, 100).iterator());
        logAppender.close();

        // Append some arbitrary bytes to the end of the log (mimics a partially written commit)
        assertTrue(fs.exists(logConfig.getLogFile().getPath()));
        fs = FileSystem.get(fs.getConf());
        FSDataOutputStream outputStream = fs.append(logConfig.getLogFile().getPath(), logConfig.getBufferSize());
        outputStream.write("something-random".getBytes());
        outputStream.flush();
        outputStream.close();

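        // Reopening the appender should detect the corrupt trailing bytes and still
        // allow the subsequent append to succeed.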
        logAppender = new RollingAvroLogAppender(logConfig);
        logAppender.append(SchemaTestUtil.generateTestRecords(100, 100).iterator());
        logAppender.close();
    }

    @SuppressWarnings("unchecked")
    @Test
    public void testBasicWriteAndRead() throws IOException, URISyntaxException, InterruptedException {
        HoodieLogAppendConfig logConfig = HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath)
                .withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
                .withBaseCommitTime("100").withSchema(SchemaTestUtil.getSimpleSchema()).withFs(fs).build();
        RollingAvroLogAppender logAppender = new RollingAvroLogAppender(logConfig);
        long size1 = logAppender.getCurrentSize();

        List<IndexedRecord> inputRecords = SchemaTestUtil.generateTestRecords(0, 100);
        logAppender.append(inputRecords.iterator());
        logAppender.close();

        AvroLogReader logReader = new AvroLogReader(logConfig.getLogFile(), fs, logConfig.getSchema());
        List<GenericRecord> result = IteratorUtils.toList(logReader.readBlock(size1));
        assertEquals("Random access should return 100 records", 100, result.size());
        assertEquals("Read records should match the input records (ordering is guaranteed)", inputRecords, result);
    }

    @SuppressWarnings("unchecked")
    @Test
    public void testBasicAppendAndRead() throws IOException, URISyntaxException, InterruptedException {
        HoodieLogAppendConfig logConfig = HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath)
                .withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
                .withBaseCommitTime("100").withSchema(SchemaTestUtil.getSimpleSchema()).withFs(fs).build();
        RollingAvroLogAppender logAppender = new RollingAvroLogAppender(logConfig);
        logAppender.append(SchemaTestUtil.generateTestRecords(0, 100).iterator());
        long size1 = logAppender.getCurrentSize();
        logAppender.close();

        // Close and Open again and append 100 more records
        logAppender = new RollingAvroLogAppender(logConfig);
        List<IndexedRecord> secondBatchInput = SchemaTestUtil.generateTestRecords(100, 100);
        logAppender.append(secondBatchInput.iterator());
        long size2 = logAppender.getCurrentSize();
        logAppender.close();

        // Close and Open again and append 100 more records
        logAppender = new RollingAvroLogAppender(logConfig);
        List<IndexedRecord> lastBatchInput = SchemaTestUtil.generateTestRecords(200, 100);
        logAppender.append(lastBatchInput.iterator());
        long size3 = logAppender.getCurrentSize();
        logAppender.close();

        AvroLogReader logReader = new AvroLogReader(logConfig.getLogFile(), fs, logConfig.getSchema());

        // Try to grab the middle block here
        List<GenericRecord> secondBatch = IteratorUtils.toList(logReader.readBlock(size1));
        assertEquals("Reading the middle block should return 100 records", 100, secondBatch.size());
        assertEquals("Collected list should match the input list (ordering guaranteed)", secondBatchInput,
                secondBatch);

        // Try to grab the last block here
        List<GenericRecord> lastBatch = IteratorUtils.toList(logReader.readBlock(size2));
        assertEquals("Reading the last block should return 100 records", 100, lastBatch.size());
        assertEquals("Collected list should match the input list (ordering guaranteed)", lastBatchInput, lastBatch);

        // Reading past the last block should return nothing
        List<GenericRecord> imaginaryBatch = IteratorUtils.toList(logReader.readBlock(size3));
        assertEquals("Reading past the end of the log should return 0 records", 0, imaginaryBatch.size());
    }

    @Test
    public void testAppendAndReadOnCorruptedLog() throws IOException, URISyntaxException, InterruptedException {
        HoodieLogAppendConfig logConfig = HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath)
                .withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
                .withBaseCommitTime("100").withSchema(SchemaTestUtil.getSimpleSchema()).withFs(fs).build();
        RollingAvroLogAppender logAppender = new RollingAvroLogAppender(logConfig);
        long size1 = logAppender.getCurrentSize();
        logAppender.append(SchemaTestUtil.generateTestRecords(0, 100).iterator());
        logAppender.close();

        // Append some arbitrary bytes to the end of the log (mimics a partially written commit)
        assertTrue(fs.exists(logConfig.getLogFile().getPath()));
        fs = FileSystem.get(fs.getConf());
        FSDataOutputStream outputStream = fs.append(logConfig.getLogFile().getPath(), logConfig.getBufferSize());
        outputStream.write("something-random".getBytes());
        outputStream.flush();
        outputStream.close();

        logAppender = new RollingAvroLogAppender(logConfig);
        long size2 = logAppender.getCurrentSize();
        logAppender.append(SchemaTestUtil.generateTestRecords(100, 100).iterator());
        logAppender.close();

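        // Reads at the offsets recorded above should return each valid block,
        // unaffected by the corrupt bytes written between them.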
        AvroLogReader logReader = new AvroLogReader(logConfig.getLogFile(), fs, logConfig.getSchema());

        // Try to grab the first block here
        List<GenericRecord> firstBatch = IteratorUtils.toList(logReader.readBlock(size1));
        assertEquals("Reading the first block should return 100 records", 100, firstBatch.size());

        // Try to grab the last block here
        List<GenericRecord> lastBatch = IteratorUtils.toList(logReader.readBlock(size2));
        assertEquals("Reading the last block should return 100 records", 100, lastBatch.size());
    }

    @Test
    public void testCompositeAvroLogReader() throws IOException, URISyntaxException, InterruptedException {
        // Set a small threshold so that every block is a new version
        HoodieLogAppendConfig logConfig = HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath)
                .withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
                .withBaseCommitTime("100").withSchema(SchemaTestUtil.getSimpleSchema()).withSizeThreshold(500)
                .withFs(fs).build();

        RollingAvroLogAppender logAppender = new RollingAvroLogAppender(logConfig);
        long size1 = logAppender.getCurrentSize();
        List<IndexedRecord> input1 = SchemaTestUtil.generateTestRecords(0, 100);
        logAppender.append(input1.iterator());
        logAppender.close();

        // Need to rebuild config to set the latest version as path
        logConfig = HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath)
                .withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
                .withBaseCommitTime("100").withSchema(SchemaTestUtil.getSimpleSchema()).withSizeThreshold(500)
                .withFs(fs).build();
        logAppender = new RollingAvroLogAppender(logConfig);
        long size2 = logAppender.getCurrentSize();
        List<IndexedRecord> input2 = SchemaTestUtil.generateTestRecords(100, 100);
        logAppender.append(input2.iterator());
        logAppender.close();

        logConfig = HoodieLogAppendConfig.newBuilder().onPartitionPath(partitionPath)
                .withLogFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
                .withBaseCommitTime("100").withSchema(SchemaTestUtil.getSimpleSchema()).withSizeThreshold(500)
                .withFs(fs).build();
        List<HoodieLogFile> allLogFiles = FSUtils
                .getAllLogFiles(fs, partitionPath, logConfig.getLogFile().getFileId(),
                        HoodieLogFile.DELTA_EXTENSION, logConfig.getLogFile().getBaseCommitTime())
                .collect(Collectors.toList());
        assertEquals("Rolling over the size threshold should have produced 2 log files", 2, allLogFiles.size());

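        // Map each log file version to the offsets of the blocks to read from it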
        SortedMap<Integer, List<Long>> offsets = Maps.newTreeMap();
        offsets.put(1, Lists.newArrayList(size1));
        offsets.put(2, Lists.newArrayList(size2));
        CompositeAvroLogReader reader = new CompositeAvroLogReader(partitionPath,
                logConfig.getLogFile().getFileId(), logConfig.getLogFile().getBaseCommitTime(), fs,
                logConfig.getSchema(), HoodieLogFile.DELTA_EXTENSION);
        Iterator<GenericRecord> results = reader.readBlocks(offsets);
        List<GenericRecord> totalBatch = IteratorUtils.toList(results);
        assertEquals("Reading both versions should return all 200 records", 200, totalBatch.size());
        input1.addAll(input2);
        assertEquals("CompositeAvroLogReader should return 200 records from 2 versions", input1, totalBatch);
    }
}