com.uber.hoodie.TestHoodieClientOnCopyOnWriteStorage.java Source code

Introduction

Here is the source code for com.uber.hoodie.TestHoodieClientOnCopyOnWriteStorage.java.
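
The tests below pass HoodieWriteClient write APIs around as three-argument functions. The Function3 type is defined in the TestHoodieClientBase harness rather than in this file; a minimal sketch of its presumed shape, inferred from how writeFn is invoked below (the throws clause is an assumption), would be:

@FunctionalInterface
public interface Function3<R, T1, T2, T3> {
    // e.g. apply(writeClient, recordRDD, commitTime) -> JavaRDD<WriteStatus>
    R apply(T1 v1, T2 v2, T3 v3) throws Exception;
}

The Function2 used in testDeletes presumably follows the same pattern with two arguments.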

Source

/*
 * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *          http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import com.uber.hoodie.common.HoodieClientTestUtils;
import com.uber.hoodie.common.HoodieTestDataGenerator;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.TableFileSystemView;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.ParquetUtils;
import com.uber.hoodie.config.HoodieCompactionConfig;
import com.uber.hoodie.config.HoodieStorageConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.table.HoodieTable;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
import java.util.stream.Collectors;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaRDD;
import org.junit.Test;
import scala.Option;

@SuppressWarnings("unchecked")
public class TestHoodieClientOnCopyOnWriteStorage extends TestHoodieClientBase {

    /**
     * Test Auto Commit behavior for HoodieWriteClient insert API
     */
    @Test
    public void testAutoCommitOnInsert() throws Exception {
        testAutoCommit(HoodieWriteClient::insert, false);
    }

    /**
     * Test Auto Commit behavior for HoodieWriteClient insertPrepped API
     */
    @Test
    public void testAutoCommitOnInsertPrepped() throws Exception {
        testAutoCommit(HoodieWriteClient::insertPreppedRecords, true);
    }

    /**
     * Test Auto Commit behavior for HoodieWriteClient upsert API
     */
    @Test
    public void testAutoCommitOnUpsert() throws Exception {
        testAutoCommit(HoodieWriteClient::upsert, false);
    }

    /**
     * Test Auto Commit behavior for HoodieWriteClient upsertPrepped API
     */
    @Test
    public void testAutoCommitOnUpsertPrepped() throws Exception {
        testAutoCommit(HoodieWriteClient::upsertPreppedRecords, true);
    }

    /**
     * Test Auto Commit behavior for HoodieWriteClient bulk-insert API
     */
    @Test
    public void testAutoCommitOnBulkInsert() throws Exception {
        testAutoCommit(HoodieWriteClient::bulkInsert, false);
    }

    /**
     * Test Auto Commit behavior for HoodieWriteClient bulk-insert prepped API
     */
    @Test
    public void testAutoCommitOnBulkInsertPrepped() throws Exception {
        testAutoCommit((writeClient, recordRDD, commitTime) -> writeClient.bulkInsertPreppedRecords(recordRDD,
                commitTime, Option.empty()), true);
    }

    /**
     * Test auto-commit by applying write function
     *
     * @param writeFn One of the HoodieWriteClient write APIs
     * @throws Exception in case of failure
     */
    private void testAutoCommit(
            Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> writeFn,
            boolean isPrepped) throws Exception {
        // Set autoCommit false
        HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build();
        HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);

        String prevCommitTime = "000";
        String newCommitTime = "001";
        int numRecords = 200;
        JavaRDD<WriteStatus> result = insertFirstBatch(cfg, client, newCommitTime, prevCommitTime, numRecords,
                writeFn, isPrepped, false, numRecords);

        assertFalse("If Autocommit is false, then commit should not be made automatically",
                HoodieTestUtils.doesCommitExist(basePath, newCommitTime));
        assertTrue("Commit should succeed", client.commit(newCommitTime, result));
        assertTrue("After explicit commit, commit file should be created",
                HoodieTestUtils.doesCommitExist(basePath, newCommitTime));
    }

    /**
     * Test De-duplication behavior for HoodieWriteClient insert API
     */
    @Test
    public void testDeduplicationOnInsert() throws Exception {
        testDeduplication(HoodieWriteClient::insert);
    }

    /**
     * Test De-duplication behavior for HoodieWriteClient bulk-insert API
     */
    @Test
    public void testDeduplicationOnBulkInsert() throws Exception {
        testDeduplication(HoodieWriteClient::bulkInsert);
    }

    /**
     * Test De-duplication behavior for HoodieWriteClient upsert API
     */
    @Test
    public void testDeduplicationOnUpsert() throws Exception {
        testDeduplication(HoodieWriteClient::upsert);
    }

    /**
     * Test deduplication logic for a write function
     *
     * @param writeFn One of the HoodieWriteClient non-prepped write APIs
     * @throws Exception in case of failure
     */
    private void testDeduplication(
            Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> writeFn)
            throws Exception {
        String newCommitTime = "001";

        String recordKey = UUID.randomUUID().toString();
        HoodieKey keyOne = new HoodieKey(recordKey, "2018-01-01");
        HoodieRecord recordOne = new HoodieRecord(keyOne,
                HoodieTestDataGenerator.generateRandomValue(keyOne, newCommitTime));

        HoodieKey keyTwo = new HoodieKey(recordKey, "2018-02-01");
        HoodieRecord recordTwo = new HoodieRecord(keyTwo,
                HoodieTestDataGenerator.generateRandomValue(keyTwo, newCommitTime));

        // Same key and partition as keyTwo
        HoodieRecord recordThree = new HoodieRecord(keyTwo,
                HoodieTestDataGenerator.generateRandomValue(keyTwo, newCommitTime));

        JavaRDD<HoodieRecord> records = jsc.parallelize(Arrays.asList(recordOne, recordTwo, recordThree), 1);

        // dedup should be done based on recordKey only
        HoodieWriteClient clientWithDummyGlobalIndex = getWriteClientWithDummyIndex(true);
        List<HoodieRecord> dedupedRecs = clientWithDummyGlobalIndex.deduplicateRecords(records, 1).collect();
        assertEquals(1, dedupedRecs.size());
        assertNodupesWithinPartition(dedupedRecs);

        // dedup should be done based on both recordKey and partitionPath
        HoodieWriteClient clientWithDummyNonGlobalIndex = getWriteClientWithDummyIndex(false);
        dedupedRecs = clientWithDummyNonGlobalIndex.deduplicateRecords(records, 1).collect();
        assertEquals(2, dedupedRecs.size());
        assertNodupesWithinPartition(dedupedRecs);

        // Perform write-action and check
        HoodieWriteClient client = new HoodieWriteClient(jsc, getConfigBuilder().combineInput(true, true).build());
        client.startCommitWithTime(newCommitTime);
        List<WriteStatus> statuses = writeFn.apply(client, records, newCommitTime).collect();
        assertNoWriteErrors(statuses);
        assertEquals(2, statuses.size());
        assertNodupesWithinPartition(statuses.stream().map(WriteStatus::getWrittenRecords)
                .flatMap(Collection::stream).collect(Collectors.toList()));
    }

    /**
     * Build a test HoodieWriteClient with a dummy index whose isGlobal flag can be controlled
     *
     * @param isGlobal Flag to control HoodieIndex.isGlobal
     * @return Hoodie Write Client
     * @throws Exception in case of error
     */
    private HoodieWriteClient getWriteClientWithDummyIndex(final boolean isGlobal) throws Exception {
        HoodieIndex index = mock(HoodieIndex.class);
        when(index.isGlobal()).thenReturn(isGlobal);
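        // A global index deduplicates on the record key alone, while a non-global
        // index also keys on the partition path (exercised in testDeduplication above)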
        return new HoodieWriteClient(jsc, getConfigBuilder().build(), false, index);
    }

    /**
     * Test Upsert API
     */
    @Test
    public void testUpserts() throws Exception {
        testUpsertsInternal(getConfig(), HoodieWriteClient::upsert, false);
    }

    /**
     * Test Upsert API using temporary folders.
     */
    @Test
    public void testUpsertsWithFinalizeWrite() throws Exception {
        HoodieWriteConfig hoodieWriteConfig = getConfigBuilder().withUseTempFolderCopyOnWriteForCreate(true)
                .withUseTempFolderCopyOnWriteForMerge(true).build();
        testUpsertsInternal(hoodieWriteConfig, HoodieWriteClient::upsert, false);
    }

    /**
     * Test UpsertPrepped API
     */
    @Test
    public void testUpsertsPrepped() throws Exception {
        testUpsertsInternal(getConfig(), HoodieWriteClient::upsertPreppedRecords, true);
    }

    /**
     * Test UpsertPrepped API using temporary folders.
     */
    @Test
    public void testUpsertsPreppedWithFinalizeWrite() throws Exception {
        HoodieWriteConfig hoodieWriteConfig = getConfigBuilder().withUseTempFolderCopyOnWriteForCreate(true)
                .withUseTempFolderCopyOnWriteForMerge(true).build();
        testUpsertsInternal(hoodieWriteConfig, HoodieWriteClient::upsertPreppedRecords, true);
    }

    /**
     * Test one of HoodieWriteClient upsert(Prepped) APIs
     *
     * @param hoodieWriteConfig Write Config
     * @param writeFn One of the HoodieWriteClient upsert APIs
     * @throws Exception in case of error
     */
    private void testUpsertsInternal(HoodieWriteConfig hoodieWriteConfig,
            Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> writeFn,
            boolean isPrepped) throws Exception {
        HoodieWriteClient client = new HoodieWriteClient(jsc, hoodieWriteConfig);

        // Write 1 (only inserts)
        String newCommitTime = "001";
        String initCommitTime = "000";
        int numRecords = 200;
        insertFirstBatch(hoodieWriteConfig, client, newCommitTime, initCommitTime, numRecords,
                HoodieWriteClient::insert, isPrepped, true, numRecords);

        // Write 2 (updates)
        String prevCommitTime = newCommitTime;
        newCommitTime = "004";
        numRecords = 100;
        String commitTimeBetweenPrevAndNew = "002";
        updateBatch(hoodieWriteConfig, client, newCommitTime, prevCommitTime,
                Optional.of(Arrays.asList(commitTimeBetweenPrevAndNew)), initCommitTime, numRecords, writeFn,
                isPrepped, true, numRecords, 200, 2);
    }

    /**
     * Test deletion of records
     */
    @Test
    public void testDeletes() throws Exception {
        HoodieWriteClient client = new HoodieWriteClient(jsc, getConfig());

        /**
         * Write 1 (inserts and deletes)
         * Write 200 actual insert records; the 100 delete records refer to keys
         * that do not exist yet, so they are ignored.
         */
        String initCommitTime = "000";
        String newCommitTime = "001";

        final List<HoodieRecord> recordsInFirstBatch = new ArrayList<>();
        Function2<List<HoodieRecord>, String, Integer> recordGenFunction = (String commitTime,
                Integer numRecordsInThisCommit) -> {
            List<HoodieRecord> fewRecordsForInsert = dataGen.generateInserts(commitTime, 200);
            List<HoodieRecord> fewRecordsForDelete = dataGen.generateDeletes(commitTime, 100);

            recordsInFirstBatch.addAll(fewRecordsForInsert);
            recordsInFirstBatch.addAll(fewRecordsForDelete);
            return recordsInFirstBatch;
        };
        writeBatch(client, newCommitTime, initCommitTime, Optional.empty(), initCommitTime,
                // -1 is unused: the generator function above hard-codes its insert/delete counts
                -1, recordGenFunction, HoodieWriteClient::upsert, true, 200, 200, 1);

        /**
         * Write 2 (updates and deletes)
         */
        String prevCommitTime = newCommitTime;
        newCommitTime = "004";
        final List<HoodieRecord> recordsInSecondBatch = new ArrayList<>();

        recordGenFunction = (String commitTime, Integer numRecordsInThisCommit) -> {
            List<HoodieRecord> fewRecordsForDelete = recordsInFirstBatch.subList(0, 50);
            List<HoodieRecord> fewRecordsForUpdate = recordsInFirstBatch.subList(50, 100);
            recordsInSecondBatch.addAll(dataGen.generateDeletesFromExistingRecords(fewRecordsForDelete));
            recordsInSecondBatch.addAll(fewRecordsForUpdate);
            return recordsInSecondBatch;
        };
        writeBatch(client, newCommitTime, prevCommitTime, Optional.empty(), initCommitTime, 100, recordGenFunction,
                HoodieWriteClient::upsert, true, 50, 150, 2);
    }

    /**
     * Test scenario of new file-group getting added during upsert()
     */
    @Test
    public void testSmallInsertHandlingForUpserts() throws Exception {
        final String testPartitionPath = "2016/09/26";
        final int insertSplitLimit = 100;
        // setup the small file handling params
        HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit); // holds up to 200 records max
        dataGen = new HoodieTestDataGenerator(new String[] { testPartitionPath });

        HoodieWriteClient client = new HoodieWriteClient(jsc, config);

        // Inserts => will write file1
        String commitTime1 = "001";
        client.startCommitWithTime(commitTime1);
        List<HoodieRecord> inserts1 = dataGen.generateInserts(commitTime1, insertSplitLimit); // this writes ~500kb
        Set<String> keys1 = HoodieClientTestUtils.getRecordKeys(inserts1);

        JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 1);
        List<WriteStatus> statuses = client.upsert(insertRecordsRDD1, commitTime1).collect();

        assertNoWriteErrors(statuses);

        assertEquals("Just 1 file needs to be added.", 1, statuses.size());
        String file1 = statuses.get(0).getFileId();
        assertEquals("file should contain 100 records",
                ParquetUtils
                        .readRowKeysFromParquet(jsc.hadoopConfiguration(),
                                new Path(basePath,
                                        testPartitionPath + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1)))
                        .size(),
                100);

        // Update + Inserts such that they just expand file1
        String commitTime2 = "002";
        client.startCommitWithTime(commitTime2);
        List<HoodieRecord> inserts2 = dataGen.generateInserts(commitTime2, 40);
        Set<String> keys2 = HoodieClientTestUtils.getRecordKeys(inserts2);
        List<HoodieRecord> insertsAndUpdates2 = new ArrayList<>();
        insertsAndUpdates2.addAll(inserts2);
        insertsAndUpdates2.addAll(dataGen.generateUpdates(commitTime2, inserts1));

        JavaRDD<HoodieRecord> insertAndUpdatesRDD2 = jsc.parallelize(insertsAndUpdates2, 1);
        statuses = client.upsert(insertAndUpdatesRDD2, commitTime2).collect();
        assertNoWriteErrors(statuses);

        assertEquals("Just 1 file needs to be updated.", 1, statuses.size());
        assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId());
        assertEquals("Existing file should be expanded", commitTime1, statuses.get(0).getStat().getPrevCommit());
        Path newFile = new Path(basePath,
                testPartitionPath + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1));
        assertEquals("file should contain 140 records",
                ParquetUtils.readRowKeysFromParquet(jsc.hadoopConfiguration(), newFile).size(), 140);

        List<GenericRecord> records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), newFile);
        for (GenericRecord record : records) {
            String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
            assertEquals("only expect commit2", commitTime2,
                    record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
            assertTrue("key expected to be part of commit2",
                    keys2.contains(recordKey) || keys1.contains(recordKey));
        }

        // update + inserts such that file1 is updated and expanded, a new file2 is created.
        String commitTime3 = "003";
        client.startCommitWithTime(commitTime3);
        List<HoodieRecord> insertsAndUpdates3 = dataGen.generateInserts(commitTime3, 200);
        Set<String> keys3 = HoodieClientTestUtils.getRecordKeys(insertsAndUpdates3);
        List<HoodieRecord> updates3 = dataGen.generateUpdates(commitTime3, inserts2);
        insertsAndUpdates3.addAll(updates3);

        JavaRDD<HoodieRecord> insertAndUpdatesRDD3 = jsc.parallelize(insertsAndUpdates3, 1);
        statuses = client.upsert(insertAndUpdatesRDD3, commitTime3).collect();
        assertNoWriteErrors(statuses);

        assertEquals("2 files needs to be committed.", 2, statuses.size());
        HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
        HoodieTable table = HoodieTable.getHoodieTable(metadata, config, jsc);
        TableFileSystemView.ReadOptimizedView fileSystemView = table.getROFileSystemView();
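        // The read-optimized view surfaces only the latest parquet (base) file of each file group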
        List<HoodieDataFile> files = fileSystemView.getLatestDataFilesBeforeOrOn(testPartitionPath, commitTime3)
                .collect(Collectors.toList());
        int numTotalInsertsInCommit3 = 0;
        int numTotalUpdatesInCommit3 = 0;
        for (HoodieDataFile file : files) {
            if (file.getFileName().contains(file1)) {
                assertEquals("Existing file should be expanded", commitTime3, file.getCommitTime());
                records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), new Path(file.getPath()));
                for (GenericRecord record : records) {
                    String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
                    String recordCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString();
                    if (recordCommitTime.equals(commitTime3)) {
                        if (keys2.contains(recordKey)) {
                            keys2.remove(recordKey);
                            numTotalUpdatesInCommit3++;
                        } else {
                            numTotalInsertsInCommit3++;
                        }
                    }
                }
                assertEquals("All keys added in commit 2 must be updated in commit3 correctly", 0, keys2.size());
            } else {
                assertEquals("New file must be written for commit 3", commitTime3, file.getCommitTime());
                records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), new Path(file.getPath()));
                for (GenericRecord record : records) {
                    String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
                    assertEquals("only expect commit3", commitTime3,
                            record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
                    assertTrue("key expected to be part of commit3", keys3.contains(recordKey));
                }
                numTotalInsertsInCommit3 += records.size();
            }
        }
        assertEquals("Total updates in commit3 must add up", inserts2.size(), numTotalUpdatesInCommit3);
        assertEquals("Total inserts in commit3 must add up", keys3.size(), numTotalInsertsInCommit3);
    }

    /**
     * Test scenario of new file-group getting added during insert()
     */
    @Test
    public void testSmallInsertHandlingForInserts() throws Exception {

        final String testPartitionPath = "2016/09/26";
        final int insertSplitLimit = 100;
        // setup the small file handling params
        HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit); // holds up to 200 records max
        dataGen = new HoodieTestDataGenerator(new String[] { testPartitionPath });
        HoodieWriteClient client = new HoodieWriteClient(jsc, config);

        // Inserts => will write file1
        String commitTime1 = "001";
        client.startCommitWithTime(commitTime1);
        List<HoodieRecord> inserts1 = dataGen.generateInserts(commitTime1, insertSplitLimit); // this writes ~500kb
        Set<String> keys1 = HoodieClientTestUtils.getRecordKeys(inserts1);
        JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 1);
        List<WriteStatus> statuses = client.insert(insertRecordsRDD1, commitTime1).collect();

        assertNoWriteErrors(statuses);
        assertPartitionMetadata(new String[] { testPartitionPath }, fs);

        assertEquals("Just 1 file needs to be added.", 1, statuses.size());
        String file1 = statuses.get(0).getFileId();
        assertEquals("file should contain 100 records",
                ParquetUtils
                        .readRowKeysFromParquet(jsc.hadoopConfiguration(),
                                new Path(basePath,
                                        testPartitionPath + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1)))
                        .size(),
                100);

        // Second set of inserts should just expand file1
        String commitTime2 = "002";
        client.startCommitWithTime(commitTime2);
        List<HoodieRecord> inserts2 = dataGen.generateInserts(commitTime2, 40);
        Set<String> keys2 = HoodieClientTestUtils.getRecordKeys(inserts2);
        JavaRDD<HoodieRecord> insertRecordsRDD2 = jsc.parallelize(inserts2, 1);
        statuses = client.insert(insertRecordsRDD2, commitTime2).collect();
        assertNoWriteErrors(statuses);

        assertEquals("Just 1 file needs to be updated.", 1, statuses.size());
        assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId());
        assertEquals("Existing file should be expanded", commitTime1, statuses.get(0).getStat().getPrevCommit());
        Path newFile = new Path(basePath,
                testPartitionPath + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1));
        assertEquals("file should contain 140 records",
                ParquetUtils.readRowKeysFromParquet(jsc.hadoopConfiguration(), newFile).size(), 140);

        List<GenericRecord> records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), newFile);
        for (GenericRecord record : records) {
            String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
            String recCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString();
            assertTrue("Record expected to be part of commit 1 or commit2",
                    commitTime1.equals(recCommitTime) || commitTime2.equals(recCommitTime));
            assertTrue("key expected to be part of commit 1 or commit2",
                    keys2.contains(recordKey) || keys1.contains(recordKey));
        }

        // Lots of inserts such that file1 is updated and expanded, a new file2 is created.
        String commitTime3 = "003";
        client.startCommitWithTime(commitTime3);
        List<HoodieRecord> insert3 = dataGen.generateInserts(commitTime3, 200);
        JavaRDD<HoodieRecord> insertRecordsRDD3 = jsc.parallelize(insert3, 1);
        statuses = client.insert(insertRecordsRDD3, commitTime3).collect();
        assertNoWriteErrors(statuses);
        assertEquals("2 files needs to be committed.", 2, statuses.size());

        HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
        HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc);
        List<HoodieDataFile> files = table.getROFileSystemView()
                .getLatestDataFilesBeforeOrOn(testPartitionPath, commitTime3).collect(Collectors.toList());
        assertEquals("Total of 2 valid data files", 2, files.size());

        int totalInserts = 0;
        for (HoodieDataFile file : files) {
            assertEquals("All files must be at commit 3", commitTime3, file.getCommitTime());
            records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), new Path(file.getPath()));
            totalInserts += records.size();
        }
        assertEquals("Total number of records must add up", totalInserts,
                inserts1.size() + inserts2.size() + insert3.size());
    }

    /**
     * Test to ensure commit metadata points to valid files
     */
    @Test
    public void testCommitWritesRelativePaths() throws Exception {

        HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build();
        HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
        HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
        HoodieTable table = HoodieTable.getHoodieTable(metaClient, cfg, jsc);

        String commitTime = "000";
        client.startCommitWithTime(commitTime);

        List<HoodieRecord> records = dataGen.generateInserts(commitTime, 200);
        JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);

        JavaRDD<WriteStatus> result = client.bulkInsert(writeRecords, commitTime);

        assertTrue("Commit should succeed", client.commit(commitTime, result));
        assertTrue("After explicit commit, commit file should be created",
                HoodieTestUtils.doesCommitExist(basePath, commitTime));

        // Get parquet file paths from commit metadata
        String actionType = metaClient.getCommitActionType();
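        // The first constructor argument (false) marks the instant as completed, not inflight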
        HoodieInstant commitInstant = new HoodieInstant(false, actionType, commitTime);
        HoodieTimeline commitTimeline = metaClient.getCommitTimeline().filterCompletedInstants();
        HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
                .fromBytes(commitTimeline.getInstantDetails(commitInstant).get());
        // Base path as recorded in the table's meta client; a distinct name is used
        // so the test class's basePath field is not shadowed
        String tableBasePath = table.getMetaClient().getBasePath();
        Collection<String> commitPathNames = commitMetadata.getFileIdAndFullPaths(tableBasePath).values();

        // Read the same metadata back from the raw commit file
        String filename = HoodieTestUtils.getCommitFilePath(tableBasePath, commitTime);
        FileInputStream inputStream = new FileInputStream(filename);
        String everything = IOUtils.toString(inputStream);
        HoodieCommitMetadata metadata = HoodieCommitMetadata.fromJsonString(everything);
        HashMap<String, String> paths = metadata.getFileIdAndFullPaths(tableBasePath);
        inputStream.close();

        // Compare values in both to make sure they are equal.
        for (String pathName : paths.values()) {
            assertTrue(commitPathNames.contains(pathName));
        }
    }

    /**
     * Build a HoodieWriteConfig tuned for small data file sizes
     */
    private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize) {
        HoodieWriteConfig.Builder builder = getConfigBuilder();
        return builder
                .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                        .compactionSmallFileSize(HoodieTestDataGenerator.SIZE_PER_RECORD * 15)
                        .insertSplitSize(insertSplitSize).build()) // tolerate up to 15 records
                .withStorageConfig(HoodieStorageConfig.newBuilder()
                        .limitFileSize(HoodieTestDataGenerator.SIZE_PER_RECORD * 20).build())
                .build();
    }
}
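
As a usage note, here is a minimal sketch of the insert-then-explicit-commit round trip that testAutoCommit and testCommitWritesRelativePaths exercise. It reuses only calls that appear above; jsc, dataGen, basePath, and getConfigBuilder() are inherited from the TestHoodieClientBase harness, and the record count of 10 is arbitrary:

HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build();
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
String commitTime = "001";
client.startCommitWithTime(commitTime);
JavaRDD<HoodieRecord> records = jsc.parallelize(dataGen.generateInserts(commitTime, 10), 1);
JavaRDD<WriteStatus> result = client.insert(records, commitTime);
// With autoCommit disabled, no commit file exists yet; client.commit() creates it
client.commit(commitTime, result);
assertTrue(HoodieTestUtils.doesCommitExist(basePath, commitTime));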