com.uber.hoodie.TestHoodieClient.java Source code

Introduction

Here is the source code for com.uber.hoodie.TestHoodieClient.java
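
The tests below all follow the same basic cycle: build a HoodieWriteConfig, start a commit on a HoodieWriteClient, write records, and verify the result through a HoodieReadClient. The condensed sketch here only mirrors calls that appear in the listing (startCommit, upsert, commit, readCommit) and assumes a JavaSparkContext jsc, an SQLContext sqlContext, a basePath, and a JavaRDD of HoodieRecords named writeRecords, all of which the tests set up for themselves.

    // Illustrative sketch only, mirroring the client calls exercised in the tests below.
    HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath)
            .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
            .forTable("test-trip-table").build();
    HoodieWriteClient writeClient = new HoodieWriteClient(jsc, cfg);
    String commitTime = writeClient.startCommit();                      // begin a new commit
    JavaRDD<WriteStatus> result = writeClient.upsert(writeRecords, commitTime);
    writeClient.commit(commitTime, result);                             // make the write visible
    HoodieReadClient readClient = new HoodieReadClient(jsc, basePath, sqlContext);
    long written = readClient.readCommit(commitTime).count();           // read back the committed records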

Source

/*
 * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *          http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie;

import com.google.common.collect.Iterables;

import com.uber.hoodie.common.HoodieCleanStat;
import com.uber.hoodie.common.HoodieClientTestUtils;
import com.uber.hoodie.common.HoodieTestDataGenerator;
import com.uber.hoodie.common.model.HoodieCleaningPolicy;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieFileGroup;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.model.HoodieWriteStat;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.TableFileSystemView;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.ParquetUtils;
import com.uber.hoodie.config.HoodieCompactionConfig;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieStorageConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieRollbackException;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.table.HoodieTable;

import java.util.Collection;
import java.util.Map;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.scheduler.SparkListener;
import org.apache.spark.scheduler.SparkListenerTaskEnd;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.util.AccumulatorV2;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.TreeSet;
import java.util.stream.Collectors;
import scala.collection.Iterator;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

public class TestHoodieClient implements Serializable {
    private transient JavaSparkContext jsc = null;
    private transient SQLContext sqlContext;
    private String basePath = null;
    private transient HoodieTestDataGenerator dataGen = null;
    private String[] partitionPaths = { "2016/01/01", "2016/02/02", "2016/06/02" };

    @Before
    public void init() throws IOException {
        // Initialize a local spark env
        SparkConf sparkConf = new SparkConf().setAppName("TestHoodieClient").setMaster("local[4]");
        jsc = new JavaSparkContext(HoodieReadClient.addHoodieSupport(sparkConf));

        // SQLContext used to read back and verify written data
        sqlContext = new SQLContext(jsc);

        // Create a temp folder as the base path
        TemporaryFolder folder = new TemporaryFolder();
        folder.create();
        basePath = folder.getRoot().getAbsolutePath();
        HoodieTestUtils.init(basePath);
        dataGen = new HoodieTestDataGenerator();
    }

    private HoodieWriteConfig getConfig() {
        return getConfigBuilder().build();
    }

    private HoodieWriteConfig.Builder getConfigBuilder() {
        return HoodieWriteConfig.newBuilder().withPath(basePath)
                .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
                .withCompactionConfig(
                        HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build())
                .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build())
                .forTable("test-trip-table")
                .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
    }

    private void assertNoWriteErrors(List<WriteStatus> statuses) {
        // Verify there are no errors
        for (WriteStatus status : statuses) {
            assertFalse("Errors found in write of " + status.getFileId(), status.hasErrors());
        }
    }

    private void assertPartitionMetadata(String[] partitionPaths, FileSystem fs) throws IOException {
        for (String partitionPath : partitionPaths) {
            assertTrue(HoodiePartitionMetadata.hasPartitionMetadata(fs, new Path(basePath, partitionPath)));
            HoodiePartitionMetadata pmeta = new HoodiePartitionMetadata(fs, new Path(basePath, partitionPath));
            pmeta.readFromFS();
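            // Partition paths like 2016/01/01 are three levels deep, hence the expected depth of 3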
            assertEquals(3, pmeta.getPartitionDepth());
        }
    }

    private void checkTaggedRecords(List<HoodieRecord> taggedRecords, String commitTime) {
        for (HoodieRecord rec : taggedRecords) {
            assertTrue("Record " + rec + " found with no location.", rec.isCurrentLocationKnown());
            assertEquals("All records should have commit time " + commitTime + ", since updates were made",
                    rec.getCurrentLocation().getCommitTime(), commitTime);
        }
    }

    @Test
    public void testFilterExist() throws Exception {
        HoodieWriteConfig config = getConfig();
        HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config);
        String newCommitTime = writeClient.startCommit();
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
        JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);

        HoodieReadClient readClient = new HoodieReadClient(jsc, config.getBasePath());
        JavaRDD<HoodieRecord> filteredRDD = readClient.filterExists(recordsRDD);

        // Nothing has been written yet, so all 100 records should pass through the filter
        assertTrue(filteredRDD.collect().size() == 100);

        JavaRDD<HoodieRecord> smallRecordsRDD = jsc.parallelize(records.subList(0, 75), 1);
        // Bulk-insert the first 75 records, leaving 25 of the original 100 unwritten
        List<WriteStatus> statuses = writeClient.bulkInsert(smallRecordsRDD, newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);

        readClient = new HoodieReadClient(jsc, config.getBasePath());
        filteredRDD = readClient.filterExists(recordsRDD);
        List<HoodieRecord> result = filteredRDD.collect();
        // Check results
        assertTrue(result.size() == 25);
    }

    @Test
    public void testAutoCommit() throws Exception {
        // Set autoCommit false
        HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build();
        HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);

        String newCommitTime = "001";
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
        JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);

        JavaRDD<WriteStatus> result = client.bulkInsert(writeRecords, newCommitTime);
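        // Nothing is committed yet: with autoCommit disabled the write above is only staged until client.commit() is called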

        assertFalse("If Autocommit is false, then commit should not be made automatically",
                HoodieTestUtils.doesCommitExist(basePath, newCommitTime));
        assertTrue("Commit should succeed", client.commit(newCommitTime, result));
        assertTrue("After explicit commit, commit file should be created",
                HoodieTestUtils.doesCommitExist(basePath, newCommitTime));

        newCommitTime = "002";
        records = dataGen.generateUpdates(newCommitTime, 100);
        JavaRDD<HoodieRecord> updateRecords = jsc.parallelize(records, 1);
        result = client.upsert(updateRecords, newCommitTime);
        assertFalse("If Autocommit is false, then commit should not be made automatically",
                HoodieTestUtils.doesCommitExist(basePath, newCommitTime));
        assertTrue("Commit should succeed", client.commit(newCommitTime, result));
        assertTrue("After explicit commit, commit file should be created",
                HoodieTestUtils.doesCommitExist(basePath, newCommitTime));
    }

    @Test
    public void testUpserts() throws Exception {
        HoodieWriteConfig cfg = getConfig();
        HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
        HoodieIndex index = HoodieIndex.createIndex(cfg, jsc);
        FileSystem fs = FSUtils.getFs();

        /**
         * Write 1 (only inserts)
         */
        String newCommitTime = "001";
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
        JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);

        List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
        assertNoWriteErrors(statuses);

        // check the partition metadata is written out
        assertPartitionMetadata(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, fs);

        // verify that there is a commit
        HoodieReadClient readClient = new HoodieReadClient(jsc, basePath, sqlContext);
        assertEquals("Expecting a single commit.", readClient.listCommitsSince("000").size(), 1);
        assertEquals("Latest commit should be 001", readClient.latestCommit(), newCommitTime);
        assertEquals("Must contain 200 records", readClient.readCommit(newCommitTime).count(), records.size());
        // Should have 200 records in table (check using Index), all in locations marked at commit
        HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
        HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());

        List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table).collect();
        checkTaggedRecords(taggedRecords, "001");

        /**
         * Write 2 (updates)
         */
        newCommitTime = "004";
        records = dataGen.generateUpdates(newCommitTime, 100);
        LinkedHashMap<HoodieKey, HoodieRecord> recordsMap = new LinkedHashMap<>();
        for (HoodieRecord rec : records) {
            if (!recordsMap.containsKey(rec.getKey())) {
                recordsMap.put(rec.getKey(), rec);
            }
        }
        List<HoodieRecord> dedupedRecords = new ArrayList<>(recordsMap.values());

        statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);

        // verify there are now 2 commits
        readClient = new HoodieReadClient(jsc, basePath, sqlContext);
        assertEquals("Expecting two commits.", readClient.listCommitsSince("000").size(), 2);
        assertEquals("Latest commit should be 004", readClient.latestCommit(), newCommitTime);

        metaClient = new HoodieTableMetaClient(fs, basePath);
        table = HoodieTable.getHoodieTable(metaClient, getConfig());

        // Index should be able to locate all updates in correct locations.
        taggedRecords = index.tagLocation(jsc.parallelize(dedupedRecords, 1), table).collect();
        checkTaggedRecords(taggedRecords, "004");

        // Check the entire dataset still has 200 records
        String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
        for (int i = 0; i < fullPartitionPaths.length; i++) {
            fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
        }
        assertEquals("Must contain 200 records", readClient.read(fullPartitionPaths).count(), 200);

        // Check that incremental consumption from earlier commit times returns the records of commit 004
        assertEquals("Incremental consumption from time 002, should give all records in commit 004",
                readClient.readCommit(newCommitTime).count(), readClient.readSince("002").count());
        assertEquals("Incremental consumption from time 001, should give all records in commit 004",
                readClient.readCommit(newCommitTime).count(), readClient.readSince("001").count());
    }

    @Test
    public void testDeletes() throws Exception {

        HoodieWriteConfig cfg = getConfig();
        HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
        HoodieIndex index = HoodieIndex.createIndex(cfg, jsc);
        FileSystem fs = FSUtils.getFs();

        /**
         * Write 1 (inserts and deletes)
         * Write 200 actual insert records; the 100 delete records are ignored since nothing exists for them to delete
         */
        String newCommitTime = "001";
        List<HoodieRecord> fewRecordsForInsert = dataGen.generateInserts(newCommitTime, 200);
        List<HoodieRecord> fewRecordsForDelete = dataGen.generateDeletes(newCommitTime, 100);

        List<HoodieRecord> records = new ArrayList<>(fewRecordsForInsert);
        records.addAll(fewRecordsForDelete);

        JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);

        List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
        assertNoWriteErrors(statuses);

        // verify that there is a commit
        HoodieReadClient readClient = new HoodieReadClient(jsc, basePath, sqlContext);
        assertEquals("Expecting a single commit.", readClient.listCommitsSince("000").size(), 1);
        assertEquals("Latest commit should be 001", readClient.latestCommit(), newCommitTime);
        assertEquals("Must contain 200 records", readClient.readCommit(newCommitTime).count(),
                fewRecordsForInsert.size());
        // Should have 200 records in table (check using Index), all in locations marked at commit
        HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
        HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());

        List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(fewRecordsForInsert, 1), table)
                .collect();
        checkTaggedRecords(taggedRecords, "001");

        /**
         * Write 2 (deletes+writes)
         */
        newCommitTime = "004";
        fewRecordsForDelete = records.subList(0, 50);
        List<HoodieRecord> fewRecordsForUpdate = records.subList(50, 100);
        records = dataGen.generateDeletesFromExistingRecords(fewRecordsForDelete);

        records.addAll(fewRecordsForUpdate);

        statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);

        // verify there are now 2 commits
        readClient = new HoodieReadClient(jsc, basePath, sqlContext);
        assertEquals("Expecting two commits.", readClient.listCommitsSince("000").size(), 2);
        assertEquals("Latest commit should be 004", readClient.latestCommit(), newCommitTime);

        metaClient = new HoodieTableMetaClient(fs, basePath);
        table = HoodieTable.getHoodieTable(metaClient, getConfig());

        // Check the entire dataset still has 150 records (200 - 50)
        String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
        for (int i = 0; i < fullPartitionPaths.length; i++) {
            fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
        }
        assertEquals("Must contain 150 records", readClient.read(fullPartitionPaths).count(), 150);

        // Check incremental consumption from the latest commit and from earlier commit times
        assertEquals("Incremental consumption from latest commit, should give 50 updated records",
                readClient.readCommit(newCommitTime).count(), 50);
        assertEquals("Incremental consumption from time 001, should give 50 updated records", 50,
                readClient.readSince("001").count());
        assertEquals("Incremental consumption from time 000, should give 150", 150,
                readClient.readSince("000").count());
    }

    @Test
    public void testCreateSavepoint() throws Exception {
        HoodieWriteConfig cfg = getConfigBuilder()
                .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(1).build())
                .build();
        HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
        FileSystem fs = FSUtils.getFs();
        HoodieTestDataGenerator.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS,
                basePath);

        /**
         * Write 1 (only inserts)
         */
        String newCommitTime = "001";
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
        List<WriteStatus> statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
        assertNoWriteErrors(statuses);

        /**
         * Write 2 (updates)
         */
        newCommitTime = "002";
        records = dataGen.generateUpdates(newCommitTime, records);
        statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);

        client.savepoint("hoodie-unit-test", "test");
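        // Commit 002 is now savepointed: rolling it back must fail and its data files must survive cleaning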
        try {
            client.rollback(newCommitTime);
            fail("Rollback of a savepoint was allowed " + newCommitTime);
        } catch (HoodieRollbackException e) {
            // this is good
        }

        /**
         * Write 3 (updates)
         */
        newCommitTime = "003";
        records = dataGen.generateUpdates(newCommitTime, records);
        statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);

        /**
         * Write 4 (updates)
         */
        newCommitTime = "004";
        records = dataGen.generateUpdates(newCommitTime, records);
        statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);

        List<String> partitionPaths = FSUtils.getAllPartitionPaths(fs, cfg.getBasePath(),
                getConfig().shouldAssumeDatePartitioning());
        HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
        HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
        final TableFileSystemView.ReadOptimizedView view = table.getROFileSystemView();
        List<HoodieDataFile> dataFiles = partitionPaths.stream().flatMap(s -> {
            return view.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("002"));
        }).collect(Collectors.toList());

        assertEquals("The data files for commit 002 should not be cleaned", 3, dataFiles.size());

        // Delete savepoint
        assertFalse(table.getCompletedSavepointTimeline().empty());
        client.deleteSavepoint(
                table.getCompletedSavepointTimeline().getInstants().findFirst().get().getTimestamp());
        // Rollback and re-upsert commit 004
        client.rollback(newCommitTime);
        statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);

        metaClient = new HoodieTableMetaClient(fs, basePath);
        table = HoodieTable.getHoodieTable(metaClient, getConfig());
        final TableFileSystemView.ReadOptimizedView view1 = table.getROFileSystemView();
        dataFiles = partitionPaths.stream().flatMap(s -> {
            return view1.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("002"));
        }).collect(Collectors.toList());

        assertEquals("The data files for commit 002 should be cleaned now", 0, dataFiles.size());
    }

    @Test
    public void testRollbackToSavepoint() throws Exception {
        HoodieWriteConfig cfg = getConfigBuilder()
                .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(1).build())
                .build();
        HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
        FileSystem fs = FSUtils.getFs();
        HoodieTestDataGenerator.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS,
                basePath);

        /**
         * Write 1 (only inserts)
         */
        String newCommitTime = "001";
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
        JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);

        List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
        assertNoWriteErrors(statuses);

        /**
         * Write 2 (updates)
         */
        newCommitTime = "002";
        records = dataGen.generateUpdates(newCommitTime, records);
        statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);

        client.savepoint("hoodie-unit-test", "test");
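        // Savepoint commit 002 so that later commits (003, 004) can be rolled back to this point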

        /**
         * Write 3 (updates)
         */
        newCommitTime = "003";
        records = dataGen.generateUpdates(newCommitTime, records);
        statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);
        List<String> partitionPaths = FSUtils.getAllPartitionPaths(fs, cfg.getBasePath(),
                getConfig().shouldAssumeDatePartitioning());
        HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
        HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
        final TableFileSystemView.ReadOptimizedView view1 = table.getROFileSystemView();

        List<HoodieDataFile> dataFiles = partitionPaths.stream().flatMap(s -> {
            return view1.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("003"));
        }).collect(Collectors.toList());
        assertEquals("The data files for commit 003 should be present", 3, dataFiles.size());

        /**
         * Write 4 (updates)
         */
        newCommitTime = "004";
        records = dataGen.generateUpdates(newCommitTime, records);
        statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);

        metaClient = new HoodieTableMetaClient(fs, basePath);
        table = HoodieTable.getHoodieTable(metaClient, getConfig());
        final TableFileSystemView.ReadOptimizedView view2 = table.getROFileSystemView();

        dataFiles = partitionPaths.stream().flatMap(s -> {
            return view2.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("004"));
        }).collect(Collectors.toList());
        assertEquals("The data files for commit 004 should be present", 3, dataFiles.size());

        // rolling back to a non-existent savepoint must not succeed
        try {
            client.rollbackToSavepoint("001");
            fail("Rolling back to non-existent savepoint should not be allowed");
        } catch (HoodieRollbackException e) {
            // this is good
        }

        // rollback to savepoint 002
        HoodieInstant savepoint = table.getCompletedSavepointTimeline().getInstants().findFirst().get();
        client.rollbackToSavepoint(savepoint.getTimestamp());

        metaClient = new HoodieTableMetaClient(fs, basePath);
        table = HoodieTable.getHoodieTable(metaClient, getConfig());
        final TableFileSystemView.ReadOptimizedView view3 = table.getROFileSystemView();
        dataFiles = partitionPaths.stream().flatMap(s -> {
            return view3.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("002"));
        }).collect(Collectors.toList());
        assertEquals("The data files for commit 002 be available", 3, dataFiles.size());

        dataFiles = partitionPaths.stream().flatMap(s -> {
            return view3.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("003"));
        }).collect(Collectors.toList());
        assertEquals("The data files for commit 003 should be rolled back", 0, dataFiles.size());

        dataFiles = partitionPaths.stream().flatMap(s -> {
            return view3.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("004"));
        }).collect(Collectors.toList());
        assertEquals("The data files for commit 004 should be rolled back", 0, dataFiles.size());
    }

    @Test
    public void testInsertAndCleanByVersions() throws Exception {
        int maxVersions = 2; // keep up to 2 versions of each file
        HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig(HoodieCompactionConfig.newBuilder()
                .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(maxVersions)
                .build()).build();
        HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
        HoodieIndex index = HoodieIndex.createIndex(cfg, jsc);
        FileSystem fs = FSUtils.getFs();

        /**
         * Do a big insert
         * (this is basically the same as the insert part of upsert; it is added here so we can
         * catch breakages in insert() if the implementation diverges.)
         */
        String newCommitTime = client.startCommit();
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 500);
        JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 5);

        List<WriteStatus> statuses = client.insert(writeRecords, newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);

        // verify that there is a commit

        assertEquals("Expecting a single commit.",
                new HoodieReadClient(jsc, basePath).listCommitsSince("000").size(), 1);
        // Should have 500 records in table (check using Index), all in locations marked at commit
        HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
        HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
        assertFalse(table.getCompletedCommitTimeline().empty());
        String commitTime = table.getCompletedCommitTimeline().getInstants().findFirst().get().getTimestamp();
        assertFalse(table.getCompletedCleanTimeline().empty());
        assertEquals("The clean instant should be the same as the commit instant", commitTime,
                table.getCompletedCleanTimeline().getInstants().findFirst().get().getTimestamp());

        List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table).collect();
        checkTaggedRecords(taggedRecords, newCommitTime);

        // Keep doing some writes and clean inline. Make sure we have the expected number of file versions remaining.
        for (int writeCnt = 2; writeCnt < 10; writeCnt++) {

            Thread.sleep(1100); // make sure commits are unique
            newCommitTime = client.startCommit();
            records = dataGen.generateUpdates(newCommitTime, 100);

            statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
            // Verify there are no errors
            assertNoWriteErrors(statuses);

            HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
            table = HoodieTable.getHoodieTable(metadata, getConfig());
            HoodieTimeline timeline = table.getCommitTimeline();

            TableFileSystemView fsView = table.getFileSystemView();
            // Ensure that, within each partition, every file group retains at most maxVersions versions, and that those are the latest ones
            for (String partitionPath : dataGen.getPartitionPaths()) {
                // compute all the versions of all files, from time 0
                HashMap<String, TreeSet<String>> fileIdToVersions = new HashMap<>();
                for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) {
                    HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
                            .fromBytes(timeline.getInstantDetails(entry).get());

                    for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) {
                        if (!fileIdToVersions.containsKey(wstat.getFileId())) {
                            fileIdToVersions.put(wstat.getFileId(), new TreeSet<>());
                        }
                        fileIdToVersions.get(wstat.getFileId())
                                .add(FSUtils.getCommitTime(new Path(wstat.getPath()).getName()));
                    }
                }

                List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath)
                        .collect(Collectors.toList());

                for (HoodieFileGroup fileGroup : fileGroups) {
                    // No file group has more than maxVersions versions
                    String fileId = fileGroup.getId();
                    List<HoodieDataFile> dataFiles = fileGroup.getAllDataFiles().collect(Collectors.toList());

                    assertTrue("fileId " + fileId + " has more than " + maxVersions + " versions",
                            dataFiles.size() <= maxVersions);

                    // Each file group retains only the latest N versions (i.e. cleaning removes older versions)
                    List<String> committedVersions = new ArrayList<>(fileIdToVersions.get(fileId));
                    for (int i = 0; i < dataFiles.size(); i++) {
                        assertEquals(
                                "File " + fileId + " does not have the latest versions on commits " + committedVersions,
                                Iterables.get(dataFiles, i).getCommitTime(),
                                committedVersions.get(committedVersions.size() - 1 - i));
                    }
                }
            }
        }
    }

    @Test
    public void testInsertAndCleanByCommits() throws Exception {
        int maxCommits = 3; // keep up to 3 commits from the past
        HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig(HoodieCompactionConfig.newBuilder()
                .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(maxCommits)
                .build()).build();
        HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
        HoodieIndex index = HoodieIndex.createIndex(cfg, jsc);
        FileSystem fs = FSUtils.getFs();

        /**
         * Do a big insert
         * (this is basically the same as the insert part of upsert; it is added here so we can
         * catch breakages in insert() if the implementation diverges.)
         */
        String newCommitTime = client.startCommit();
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 500);
        JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 5);

        List<WriteStatus> statuses = client.insert(writeRecords, newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);

        // verify that there is a commit
        assertEquals("Expecting a single commit.",
                new HoodieReadClient(jsc, basePath).listCommitsSince("000").size(), 1);
        // Should have 500 records in table (check using Index), all in locations marked at commit
        HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
        HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());

        assertFalse(table.getCompletedCommitTimeline().empty());
        String commitTime = table.getCompletedCommitTimeline().getInstants().findFirst().get().getTimestamp();
        assertFalse(table.getCompletedCleanTimeline().empty());
        assertEquals("The clean instant should be the same as the commit instant", commitTime,
                table.getCompletedCleanTimeline().getInstants().findFirst().get().getTimestamp());

        List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table).collect();
        checkTaggedRecords(taggedRecords, newCommitTime);

        // Keep doing some writes and clean inline. Make sure we have the expected number of file versions remaining.
        for (int writeCnt = 2; writeCnt < 10; writeCnt++) {
            Thread.sleep(1100); // make sure commits are unique
            newCommitTime = client.startCommit();
            records = dataGen.generateUpdates(newCommitTime, 100);

            statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
            // Verify there are no errors
            assertNoWriteErrors(statuses);

            HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
            HoodieTable table1 = HoodieTable.getHoodieTable(metadata, cfg);
            HoodieTimeline activeTimeline = table1.getCompletedCommitTimeline();
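            // Files from commits older than the earliest retained commit should have been cleaned; the remaining commits are "acceptable"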
            Optional<HoodieInstant> earliestRetainedCommit = activeTimeline.nthFromLastInstant(maxCommits - 1);
            Set<HoodieInstant> acceptableCommits = activeTimeline.getInstants().collect(Collectors.toSet());
            if (earliestRetainedCommit.isPresent()) {
                acceptableCommits.removeAll(
                        activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp())
                                .getInstants().collect(Collectors.toSet()));
                acceptableCommits.add(earliestRetainedCommit.get());
            }

            TableFileSystemView fsView = table1.getFileSystemView();
            // Ensure every file group only contains data files from the acceptable (retained) commits
            for (String partitionPath : dataGen.getPartitionPaths()) {
                List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath)
                        .collect(Collectors.toList());
                for (HoodieFileGroup fileGroup : fileGroups) {
                    Set<String> commitTimes = new HashSet<>();
                    fileGroup.getAllDataFiles().forEach(value -> {
                        System.out.println("Data File - " + value);
                        commitTimes.add(value.getCommitTime());
                    });
                    assertEquals("Only contain acceptable versions of file should be present",
                            acceptableCommits.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toSet()),
                            commitTimes);
                }
            }
        }
    }

    @Test
    public void testRollbackCommit() throws Exception {
        // Let's create some commit files and parquet files
        String commitTime1 = "20160501010101";
        String commitTime2 = "20160502020601";
        String commitTime3 = "20160506030611";
        new File(basePath + "/.hoodie").mkdirs();
        HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(),
                new String[] { "2016/05/01", "2016/05/02", "2016/05/06" }, basePath);

        // Only first two have commit files
        HoodieTestUtils.createCommitFiles(basePath, commitTime1, commitTime2);
        // Third one has a .inflight intermediate commit file
        HoodieTestUtils.createInflightCommitFiles(basePath, commitTime3);

        // Make commit1
        String file11 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime1, "id11");
        String file12 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime1, "id12");
        String file13 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime1, "id13");

        // Make commit2
        String file21 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime2, "id21");
        String file22 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime2, "id22");
        String file23 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime2, "id23");

        // Make commit3
        String file31 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime3, "id31");
        String file32 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime3, "id32");
        String file33 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime3, "id33");

        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
                .withIndexConfig(
                        HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build())
                .build();

        HoodieWriteClient client = new HoodieWriteClient(jsc, config, false);
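        // The third constructor argument leaves automatic rollback of inflight commits turned off (see testAutoRollbackCommit below)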

        // Rollback commit 1 (this should fail, since commit2 is still around)
        try {
            client.rollback(commitTime1);
            assertTrue("Should have thrown an exception ", false);
        } catch (HoodieRollbackException hrbe) {
            // should get here
        }

        // Rollback commit3
        client.rollback(commitTime3);
        assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime3));
        assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime3, file31)
                || HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime3, file32)
                || HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime3, file33));

        // simulate partial failure, where .inflight was not deleted, but data files were.
        HoodieTestUtils.createInflightCommitFiles(basePath, commitTime3);
        client.rollback(commitTime3);
        assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime3));

        // Rollback commit2
        client.rollback(commitTime2);
        assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime2));
        assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime2));
        assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime2, file21)
                || HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22)
                || HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23));

        // simulate partial failure, where only .commit => .inflight renaming succeeded, leaving a
        // .inflight commit and a bunch of data files around.
        HoodieTestUtils.createInflightCommitFiles(basePath, commitTime2);
        file21 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime2, "id21");
        file22 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime2, "id22");
        file23 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime2, "id23");

        client.rollback(commitTime2);
        assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime2));
        assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime2));
        assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime2, file21)
                || HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22)
                || HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23));

        // Let's rollback commit1, Check results
        client.rollback(commitTime1);
        assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime1));
        assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime1));
        assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime1, file11)
                || HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime1, file12)
                || HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime1, file13));
    }

    @Test
    public void testAutoRollbackCommit() throws Exception {
        // Let's create some commit files and parquet files
        String commitTime1 = "20160501010101";
        String commitTime2 = "20160502020601";
        String commitTime3 = "20160506030611";
        new File(basePath + "/.hoodie").mkdirs();
        HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(),
                new String[] { "2016/05/01", "2016/05/02", "2016/05/06" }, basePath);

        // One good commit
        HoodieTestUtils.createCommitFiles(basePath, commitTime1);
        // Two inflight commits
        HoodieTestUtils.createInflightCommitFiles(basePath, commitTime2, commitTime3);

        // Make commit1
        String file11 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime1, "id11");
        String file12 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime1, "id12");
        String file13 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime1, "id13");

        // Make commit2
        String file21 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime2, "id21");
        String file22 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime2, "id22");
        String file23 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime2, "id23");

        // Make commit3
        String file31 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime3, "id31");
        String file32 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime3, "id32");
        String file33 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime3, "id33");

        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
                .withIndexConfig(
                        HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build())
                .build();

        // Create the client with automatic rollback of inflight commits turned off (third argument)
        new HoodieWriteClient(jsc, config, false);

        // Check results, nothing changed
        assertTrue(HoodieTestUtils.doesCommitExist(basePath, commitTime1));
        assertTrue(HoodieTestUtils.doesInflightExist(basePath, commitTime2));
        assertTrue(HoodieTestUtils.doesInflightExist(basePath, commitTime3));
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime3, file31)
                && HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime3, file32)
                && HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime3, file33));
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime2, file21)
                && HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22)
                && HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23));
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime1, file11)
                && HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime1, file12)
                && HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime1, file13));

        // Turn auto rollback on
        new HoodieWriteClient(jsc, config, true);
        assertTrue(HoodieTestUtils.doesCommitExist(basePath, commitTime1));
        assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime2));
        assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime3));
        assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime3, file31)
                || HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime3, file32)
                || HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime3, file33));
        assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime2, file21)
                || HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22)
                || HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23));
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime1, file11)
                && HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime1, file12)
                && HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime1, file13));
    }

    private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize) {
        HoodieWriteConfig.Builder builder = getConfigBuilder();
        return builder
                .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                        .compactionSmallFileSize(HoodieTestDataGenerator.SIZE_PER_RECORD * 15)
                        .insertSplitSize(insertSplitSize).build()) // tolerate up to 15 records
                .withStorageConfig(HoodieStorageConfig.newBuilder()
                        .limitFileSize(HoodieTestDataGenerator.SIZE_PER_RECORD * 20).build())
                .build();
    }

    @Test
    public void testSmallInsertHandlingForUpserts() throws Exception {

        FileSystem fs = FSUtils.getFs();
        final String TEST_PARTITION_PATH = "2016/09/26";
        final int INSERT_SPLIT_LIMIT = 10;
        // setup the small file handling params
        HoodieWriteConfig config = getSmallInsertWriteConfig(INSERT_SPLIT_LIMIT); // hold up to 20 records max
        dataGen = new HoodieTestDataGenerator(new String[] { TEST_PARTITION_PATH });

        HoodieWriteClient client = new HoodieWriteClient(jsc, config);

        // Inserts => will write file1
        String commitTime1 = "001";
        List<HoodieRecord> inserts1 = dataGen.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb
        Set<String> keys1 = HoodieClientTestUtils.getRecordKeys(inserts1);

        JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 1);
        List<WriteStatus> statuses = client.upsert(insertRecordsRDD1, commitTime1).collect();

        assertNoWriteErrors(statuses);

        assertEquals("Just 1 file needs to be added.", 1, statuses.size());
        String file1 = statuses.get(0).getFileId();
        assertEquals("file should contain 10 records",
                ParquetUtils
                        .readRowKeysFromParquet(new Path(basePath,
                                TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1)))
                        .size(),
                10);

        // Update + Inserts such that they just expand file1
        String commitTime2 = "002";
        List<HoodieRecord> inserts2 = dataGen.generateInserts(commitTime2, 4);
        Set<String> keys2 = HoodieClientTestUtils.getRecordKeys(inserts2);
        List<HoodieRecord> insertsAndUpdates2 = new ArrayList<>();
        insertsAndUpdates2.addAll(inserts2);
        insertsAndUpdates2.addAll(dataGen.generateUpdates(commitTime2, inserts1));

        JavaRDD<HoodieRecord> insertAndUpdatesRDD2 = jsc.parallelize(insertsAndUpdates2, 1);
        statuses = client.upsert(insertAndUpdatesRDD2, commitTime2).collect();
        assertNoWriteErrors(statuses);

        assertEquals("Just 1 file needs to be updated.", 1, statuses.size());
        assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId());
        assertEquals("Existing file should be expanded", commitTime1, statuses.get(0).getStat().getPrevCommit());
        Path newFile = new Path(basePath,
                TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1));
        assertEquals("file should contain 14 records", ParquetUtils.readRowKeysFromParquet(newFile).size(), 14);

        List<GenericRecord> records = ParquetUtils.readAvroRecords(newFile);
        for (GenericRecord record : records) {
            String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
            assertEquals("only expect commit2", commitTime2,
                    record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
            assertTrue("key expected to be part of commit2",
                    keys2.contains(recordKey) || keys1.contains(recordKey));
        }

        // Updates + inserts such that file1 is expanded and a new file2 is created.
        String commitTime3 = "003";
        List<HoodieRecord> insertsAndUpdates3 = dataGen.generateInserts(commitTime3, 20);
        Set<String> keys3 = HoodieClientTestUtils.getRecordKeys(insertsAndUpdates3);
        List<HoodieRecord> updates3 = dataGen.generateUpdates(commitTime3, inserts2);
        insertsAndUpdates3.addAll(updates3);

        JavaRDD<HoodieRecord> insertAndUpdatesRDD3 = jsc.parallelize(insertsAndUpdates3, 1);
        statuses = client.upsert(insertAndUpdatesRDD3, commitTime3).collect();
        assertNoWriteErrors(statuses);

        assertEquals("2 files needs to be committed.", 2, statuses.size());
        HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
        HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
        TableFileSystemView.ReadOptimizedView fileSystemView = table.getROFileSystemView();
        List<HoodieDataFile> files = fileSystemView.getLatestDataFilesBeforeOrOn(TEST_PARTITION_PATH, commitTime3)
                .collect(Collectors.toList());
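        // Expect two latest files: file1 rewritten at commit 003, plus a brand-new file for the remaining inserts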
        int numTotalInsertsInCommit3 = 0;
        for (HoodieDataFile file : files) {
            if (file.getFileName().contains(file1)) {
                assertEquals("Existing file should be expanded", commitTime3, file.getCommitTime());
                records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
                for (GenericRecord record : records) {
                    String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
                    String recordCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString();
                    if (recordCommitTime.equals(commitTime3)) {
                        if (keys2.contains(recordKey)) {
                            assertEquals("only expect commit3", commitTime3, recordCommitTime);
                            keys2.remove(recordKey);
                        } else {
                            numTotalInsertsInCommit3++;
                        }
                    }
                }
                assertEquals("All keys added in commit 2 must be updated in commit3 correctly", 0, keys2.size());
            } else {
                assertEquals("New file must be written for commit 3", commitTime3, file.getCommitTime());
                records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
                for (GenericRecord record : records) {
                    String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
                    assertEquals("only expect commit3", commitTime3,
                            record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
                    assertTrue("key expected to be part of commit3", keys3.contains(recordKey));
                }
                numTotalInsertsInCommit3 += records.size();
            }
        }
        assertEquals("Total inserts in commit3 must add up", keys3.size(), numTotalInsertsInCommit3);
    }

    @Test
    public void testSmallInsertHandlingForInserts() throws Exception {

        final String TEST_PARTITION_PATH = "2016/09/26";
        final int INSERT_SPLIT_LIMIT = 10;
        // setup the small file handling params
        HoodieWriteConfig config = getSmallInsertWriteConfig(INSERT_SPLIT_LIMIT); // hold up to 20 records max
        dataGen = new HoodieTestDataGenerator(new String[] { TEST_PARTITION_PATH });
        HoodieWriteClient client = new HoodieWriteClient(jsc, config);

        // Inserts => will write file1
        String commitTime1 = "001";
        List<HoodieRecord> inserts1 = dataGen.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb
        Set<String> keys1 = HoodieClientTestUtils.getRecordKeys(inserts1);
        JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 1);
        List<WriteStatus> statuses = client.insert(insertRecordsRDD1, commitTime1).collect();

        assertNoWriteErrors(statuses);
        assertPartitionMetadata(new String[] { TEST_PARTITION_PATH }, FSUtils.getFs());

        assertEquals("Just 1 file needs to be added.", 1, statuses.size());
        String file1 = statuses.get(0).getFileId();
        assertEquals("file should contain 10 records",
                ParquetUtils
                        .readRowKeysFromParquet(new Path(basePath,
                                TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1)))
                        .size(),
                10);

        // Second set of inserts should just expand file1
        String commitTime2 = "002";
        List<HoodieRecord> inserts2 = dataGen.generateInserts(commitTime2, 4);
        Set<String> keys2 = HoodieClientTestUtils.getRecordKeys(inserts2);
        JavaRDD<HoodieRecord> insertRecordsRDD2 = jsc.parallelize(inserts2, 1);
        statuses = client.insert(insertRecordsRDD2, commitTime2).collect();
        assertNoWriteErrors(statuses);

        assertEquals("Just 1 file needs to be updated.", 1, statuses.size());
        assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId());
        assertEquals("Existing file should be expanded", commitTime1, statuses.get(0).getStat().getPrevCommit());
        Path newFile = new Path(basePath,
                TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1));
        assertEquals("file should contain 14 records", ParquetUtils.readRowKeysFromParquet(newFile).size(), 14);

        List<GenericRecord> records = ParquetUtils.readAvroRecords(newFile);
        for (GenericRecord record : records) {
            String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
            String recCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString();
            assertTrue("Record expected to be part of commit 1 or commit2",
                    commitTime1.equals(recCommitTime) || commitTime2.equals(recCommitTime));
            assertTrue("key expected to be part of commit 1 or commit2",
                    keys2.contains(recordKey) || keys1.contains(recordKey));
        }

        // Lots of inserts such that file1 is updated and expanded, a new file2 is created.
        String commitTime3 = "003";
        List<HoodieRecord> insert3 = dataGen.generateInserts(commitTime3, 20);
        JavaRDD<HoodieRecord> insertRecordsRDD3 = jsc.parallelize(insert3, 1);
        statuses = client.insert(insertRecordsRDD3, commitTime3).collect();
        assertNoWriteErrors(statuses);
        assertEquals("2 files needs to be committed.", 2, statuses.size());

        FileSystem fs = FSUtils.getFs();
        HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
        HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
        List<HoodieDataFile> files = table.getROFileSystemView()
                .getLatestDataFilesBeforeOrOn(TEST_PARTITION_PATH, commitTime3).collect(Collectors.toList());
        assertEquals("Total of 2 valid data files", 2, files.size());

        int totalInserts = 0;
        for (HoodieDataFile file : files) {
            assertEquals("All files must be at commit 3", commitTime3, file.getCommitTime());
            records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
            totalInserts += records.size();
        }
        assertEquals("Total number of records must add up", totalInserts,
                inserts1.size() + inserts2.size() + insert3.size());
    }

    @Test
    public void testKeepLatestFileVersions() throws IOException {
        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
                .withAssumeDatePartitioning(true)
                .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1)
                        .build())
                .build();
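        // The cleaner policy retains only the single latest version of each file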

        // make 1 commit, with 1 file per partition
        HoodieTestUtils.createCommitFiles(basePath, "000");

        String file1P0C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000");
        String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000");
        HoodieTable table = HoodieTable
                .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);

        List<HoodieCleanStat> hoodieCleanStatsOne = table.clean(jsc);
        assertEquals("Must not clean any files", 0,
                getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size());
        assertEquals("Must not clean any files", 0,
                getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size());
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0));

        // make next commit, with 1 insert & 1 update per partition
        HoodieTestUtils.createCommitFiles(basePath, "001");
        table = HoodieTable.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
                config);

        String file2P0C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "001"); // insert
        String file2P1C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "001"); // insert
        HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update
        HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update

        List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc);
        assertEquals("Must clean 1 file", 1,
                getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size());
        assertEquals("Must clean 1 file", 1,
                getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size());
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "001", file2P1C1));
        assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
        assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0));

        // make next commit, with 2 updates to existing files, and 1 insert
        HoodieTestUtils.createCommitFiles(basePath, "002");
        table = HoodieTable.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
                config);

        HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update
        HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update
        String file3P0C2 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "002");

        List<HoodieCleanStat> hoodieCleanStatsThree = table.clean(jsc);
        assertEquals("Must clean two files", 2,
                getCleanStat(hoodieCleanStatsThree, partitionPaths[0]).getSuccessDeleteFiles().size());
        assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0));
        assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2));

        // No cleaning should occur for a partially written file that has no completed commit.
        HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file3P0C2); // update
        List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
        assertEquals("Must not clean any files", 0,
                getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size());
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2));
    }

    @Test
    public void testKeepLatestCommits() throws IOException {
        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
                .withAssumeDatePartitioning(true)
                .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build())
                .build();
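        // KEEP_LATEST_COMMITS with retainCommits(2) keeps every file version needed to serve
        // reads as of the last two commits, plus one version older than the earliest retained
        // commit; only versions beyond that become clean candidates (asserted below).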

        // make 1 commit, with 1 file per partition
        HoodieTestUtils.createCommitFiles(basePath, "000");

        String file1P0C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000");
        String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000");

        HoodieTable table = HoodieTable
                .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);

        List<HoodieCleanStat> hoodieCleanStatsOne = table.clean(jsc);
        assertEquals("Must not clean any files", 0,
                getCleanStat(hoodieCleanStatsOne, partitionPaths[0]).getSuccessDeleteFiles().size());
        assertEquals("Must not clean any files", 0,
                getCleanStat(hoodieCleanStatsOne, partitionPaths[1]).getSuccessDeleteFiles().size());
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0));

        // make next commit, with 1 insert & 1 update per partition
        HoodieTestUtils.createCommitFiles(basePath, "001");
        table = HoodieTable.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
                config);

        String file2P0C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "001"); // insert
        String file2P1C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "001"); // insert
        HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update
        HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update

        List<HoodieCleanStat> hoodieCleanStatsTwo = table.clean(jsc);
        assertEquals("Must not clean any files", 0,
                getCleanStat(hoodieCleanStatsTwo, partitionPaths[0]).getSuccessDeleteFiles().size());
        assertEquals("Must not clean any files", 0,
                getCleanStat(hoodieCleanStatsTwo, partitionPaths[1]).getSuccessDeleteFiles().size());
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "001", file2P1C1));
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0));

        // make next commit, with 2 updates to existing files, and 1 insert
        HoodieTestUtils.createCommitFiles(basePath, "002");
        table = HoodieTable.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
                config);

        HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update
        HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update
        String file3P0C2 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "002");

        List<HoodieCleanStat> hoodieCleanStatsThree = table.clean(jsc);
        assertEquals("Must not clean any file. We have to keep 1 version before the latest commit time to keep", 0,
                getCleanStat(hoodieCleanStatsThree, partitionPaths[0]).getSuccessDeleteFiles().size());

        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));

        // make next commit, with 2 updates to existing files, and 1 insert
        HoodieTestUtils.createCommitFiles(basePath, "003");
        table = HoodieTable.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
                config);

        HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file1P0C0); // update
        HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file2P0C1); // update
        String file4P0C3 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "003");

        List<HoodieCleanStat> hoodieCleanStatsFour = table.clean(jsc);
        assertEquals("Must not clean one old file", 1,
                getCleanStat(hoodieCleanStatsFour, partitionPaths[0]).getSuccessDeleteFiles().size());

        assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0));
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0));
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file1P0C0));
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file2P0C1));
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2));
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "003", file4P0C3));

        // No cleaning should occur for a partially written file that has no completed commit.
        HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "004", file3P0C2); // update
        List<HoodieCleanStat> hoodieCleanStatsFive = table.clean(jsc);
        assertEquals("Must not clean any files", 0,
                getCleanStat(hoodieCleanStatsFive, partitionPaths[0]).getSuccessDeleteFiles().size());
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0));
        assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1));
    }

    @Test
    public void testCleaningWithZeroPartitonPaths() throws IOException {
        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
                .withAssumeDatePartitioning(true)
                .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build())
                .build();

        // Make a commit, although there are no partitionPaths.
        // An example use case is a client creating a table with just some
        // commit metadata, but no data/partitionPaths yet.
        HoodieTestUtils.createCommitFiles(basePath, "000");

        HoodieTable table = HoodieTable
                .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);

        List<HoodieCleanStat> hoodieCleanStatsOne = table.clean(jsc);
        assertTrue("HoodieCleanStats should be empty for a table with empty partitionPaths",
                hoodieCleanStatsOne.isEmpty());
    }

    @Test
    public void testCleaningSkewedPartitons() throws IOException {
        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
                .withAssumeDatePartitioning(true)
                .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build())
                .build();
        Map<Long, Long> stageOneShuffleReadTaskRecordsCountMap = new HashMap<>();

        // Since clean repartitions the files-to-clean in order to distribute the work uniformly,
        // we can inspect the number of records read by each task in stage 1.
        // There should be no skew in the number of records read per task.

        // The SparkListener below listens for task-end events and captures the number of
        // records read by each task in stage 1.
        jsc.sc().addSparkListener(new SparkListener() {

            @Override
            public void onTaskEnd(SparkListenerTaskEnd taskEnd) {

                Iterator<AccumulatorV2<?, ?>> iterator = taskEnd.taskMetrics().accumulators().iterator();
                while (iterator.hasNext()) {
                    AccumulatorV2 accumulator = iterator.next();
                    if (taskEnd.stageId() == 1 && accumulator.isRegistered() && accumulator.name().isDefined()
                            && accumulator.name().get().equals("internal.metrics.shuffle.read.recordsRead")) {
                        stageOneShuffleReadTaskRecordsCountMap.put(taskEnd.taskInfo().taskId(),
                                (Long) accumulator.value());
                    }
                }
            }
        });
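        // The files to clean for each partition are expected to land in separate shuffle
        // tasks, so the map above should end up with one entry per partition (asserted below).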

        // make 1 commit, with 100 files in one partition and 10 in each of the other two
        HoodieTestUtils.createCommitFiles(basePath, "000");
        List<String> filesP0C0 = createFilesInPartition(partitionPaths[0], "000", 100);
        List<String> filesP1C0 = createFilesInPartition(partitionPaths[1], "000", 10);
        List<String> filesP2C0 = createFilesInPartition(partitionPaths[2], "000", 10);

        HoodieTestUtils.createCommitFiles(basePath, "001");
        updateAllFilesInPartition(filesP0C0, partitionPaths[0], "001");
        updateAllFilesInPartition(filesP1C0, partitionPaths[1], "001");
        updateAllFilesInPartition(filesP2C0, partitionPaths[2], "001");

        HoodieTestUtils.createCommitFiles(basePath, "002");
        updateAllFilesInPartition(filesP0C0, partitionPaths[0], "002");
        updateAllFilesInPartition(filesP1C0, partitionPaths[1], "002");
        updateAllFilesInPartition(filesP2C0, partitionPaths[2], "002");

        HoodieTestUtils.createCommitFiles(basePath, "003");
        updateAllFilesInPartition(filesP0C0, partitionPaths[0], "003");
        updateAllFilesInPartition(filesP1C0, partitionPaths[1], "003");
        updateAllFilesInPartition(filesP2C0, partitionPaths[2], "003");

        HoodieTable table = HoodieTable
                .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), config);
        List<HoodieCleanStat> hoodieCleanStats = table.clean(jsc);

        assertEquals(100, getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size());
        assertEquals(10, getCleanStat(hoodieCleanStats, partitionPaths[1]).getSuccessDeleteFiles().size());
        assertEquals(10, getCleanStat(hoodieCleanStats, partitionPaths[2]).getSuccessDeleteFiles().size());

        // 3 tasks are expected since the number of partitions is 3
        assertEquals(3, stageOneShuffleReadTaskRecordsCountMap.keySet().size());
        // Sum of all records processed = total number of files to clean
        assertEquals(120,
                stageOneShuffleReadTaskRecordsCountMap.values().stream().reduce((a, b) -> a + b).get().intValue());
        assertTrue(
                "The skew in handling files to clean is not removed. "
                        + "Each task should handle more records than the partitionPath with least files "
                        + "and less records than the partitionPath with most files.",
                stageOneShuffleReadTaskRecordsCountMap.values().stream().filter(a -> a > 10 && a < 100)
                        .count() == 3);
    }

    @Test
    public void testCommitWritesRelativePaths() throws Exception {

        HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build();
        HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
        FileSystem fs = FSUtils.getFs();
        HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
        HoodieTable table = HoodieTable.getHoodieTable(metaClient, cfg);

        String commitTime = "000";
        List<HoodieRecord> records = dataGen.generateInserts(commitTime, 200);
        JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);

        JavaRDD<WriteStatus> result = client.bulkInsert(writeRecords, commitTime);
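        // autoCommit is disabled above, so the bulk insert writes data files but the commit
        // is only finalized by the explicit client.commit(...) call below.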

        assertTrue("Commit should succeed", client.commit(commitTime, result));
        assertTrue("After explicit commit, commit file should be created",
                HoodieTestUtils.doesCommitExist(basePath, commitTime));

        // Get parquet file paths from commit metadata
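        // The commit metadata records file paths relative to the table base path;
        // getFileIdAndFullPaths(basePath) resolves them back to full paths, which is the
        // behaviour compared against the raw commit file read further down.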
        String actionType = table.getCompactedCommitActionType();
        HoodieInstant commitInstant = new HoodieInstant(false, actionType, commitTime);
        HoodieTimeline commitTimeline = table.getCompletedCompactionCommitTimeline();
        HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
                .fromBytes(commitTimeline.getInstantDetails(commitInstant).get());
        String basePath = table.getMetaClient().getBasePath();
        Collection<String> commitPathNames = commitMetadata.getFileIdAndFullPaths(basePath).values();

        // Read from commit file
        String filename = HoodieTestUtils.getCommitFilePath(basePath, commitTime);
        FileInputStream inputStream = new FileInputStream(filename);
        String everything = IOUtils.toString(inputStream);
        HoodieCommitMetadata metadata = HoodieCommitMetadata.fromJsonString(everything);
        HashMap<String, String> paths = metadata.getFileIdAndFullPaths(basePath);
        inputStream.close();

        // Compare values in both to make sure they are equal.
        for (String pathName : paths.values()) {
            assertTrue(commitPathNames.contains(pathName));
        }
    }

    private HoodieCleanStat getCleanStat(List<HoodieCleanStat> cleanStats, String partitionPath) {
        return cleanStats.stream().filter(e -> e.getPartitionPath().equals(partitionPath)).findFirst()
                .get();
    }

    private void updateAllFilesInPartition(List<String> files, String partitionPath, String commitTime)
            throws IOException {
        for (String fileId : files) {
            HoodieTestUtils.createDataFile(basePath, partitionPath, commitTime, fileId);
        }
    }

    private List<String> createFilesInPartition(String partitionPath, String commitTime, int numFiles)
            throws IOException {
        List<String> files = new ArrayList<>();
        for (int i = 0; i < numFiles; i++) {
            files.add(HoodieTestUtils.createNewDataFile(basePath, partitionPath, commitTime));
        }
        return files;
    }

    @After
    public void clean() {
        if (basePath != null) {
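            // Note: File#delete() is not recursive, so a non-empty basePath is not fully
            // removed here; complete cleanup is assumed to be handled by the temporary-folder
            // setup used for basePath.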
            new File(basePath).delete();
        }
        if (jsc != null) {
            jsc.stop();
        }
    }
}