com.uber.hoodie.index.bloom.TestHoodieBloomIndex.java Source code

Introduction

Here is the source code for com.uber.hoodie.index.bloom.TestHoodieBloomIndex.java, the JUnit test class for Hoodie's bloom-filter-backed index (HoodieBloomIndex).
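
Most of the tests write parquet files that carry an embedded BloomFilter and then drive HoodieBloomIndex against them: tagging incoming records with the file they already live in (tagLocation), resolving locations by key (fetchRecordLocation), pruning candidate files by key range, and checking candidate keys against a single file. As a quick orientation, here is a minimal sketch of the core tagging flow, condensed from the setup used in the tests below; it assumes a local JavaSparkContext and an already initialized Hoodie table at basePath, and the class name BloomIndexTagExample is purely illustrative.

import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
import com.uber.hoodie.table.HoodieTable;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class BloomIndexTagExample {

    // Tags each incoming record with the file it already lives in, if its key is
    // (probably) present in one of the table's parquet files; records that hit no
    // file keep an unknown location. Mirrors the calls made in testTagLocation.
    public static JavaRDD<HoodieRecord> tag(JavaSparkContext jsc, String basePath,
            JavaRDD<HoodieRecord> recordRDD) throws Exception {
        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
        HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
        HoodieTable table = HoodieTable.getHoodieTable(metadata, config, jsc);

        HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config);
        return bloomIndex.tagLocation(recordRDD, jsc, table);
    }
}

Because the bloom filter only answers "might contain", a filter hit must still be confirmed against the keys actually stored in the file; testBloomFilterFalseError below covers exactly that case.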

Source

/*
 *  Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *           http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *
 */

package com.uber.hoodie.index.bloom;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import com.google.common.base.Optional;
import com.google.common.collect.Lists;
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import com.uber.hoodie.common.BloomFilter;
import com.uber.hoodie.common.HoodieClientTestUtils;
import com.uber.hoodie.common.TestRawTripPayload;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils;
import com.uber.hoodie.config.HoodieStorageConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.io.storage.HoodieParquetConfig;
import com.uber.hoodie.io.storage.HoodieParquetWriter;
import com.uber.hoodie.table.HoodieTable;
import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.stream.Collectors;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import scala.Tuple2;

/**
 * Tests for {@link HoodieBloomIndex}: grouping record keys by partition, loading the files
 * involved in an index lookup, range-based file pruning, checking candidate keys against a
 * parquet file, tagging record locations, and tolerating bloom-filter false positives.
 */
public class TestHoodieBloomIndex {

    private JavaSparkContext jsc = null;
    private String basePath = null;
    private transient FileSystem fs;
    private String schemaStr;
    private Schema schema;

    public TestHoodieBloomIndex() throws Exception {
    }

    @Before
    public void init() throws IOException {
        // Initialize a local spark env
        jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestHoodieBloomIndex"));
        // Create a temp folder as the base path
        TemporaryFolder folder = new TemporaryFolder();
        folder.create();
        basePath = folder.getRoot().getAbsolutePath();
        fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration());
        HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath);
        // Load the schema used for the test records (the tests below span two different partitions)
        schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
        schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaStr));
    }

    @After
    public void clean() {
        if (basePath != null) {
            new File(basePath).delete();
        }
        if (jsc != null) {
            jsc.stop();
        }
    }

    @Test
    public void testLoadUUIDsInMemory() throws IOException {
        // Create one RDD of hoodie record
        String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
                + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
        String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
                + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
        String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
                + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
        String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
                + "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";

        TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
        HoodieRecord record1 = new HoodieRecord(
                new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
        TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
        HoodieRecord record2 = new HoodieRecord(
                new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
        TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
        HoodieRecord record3 = new HoodieRecord(
                new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
        TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
        HoodieRecord record4 = new HoodieRecord(
                new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);

        JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));

        // Load to memory
        Map<String, Iterable<String>> map = recordRDD
                .mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey())).groupByKey()
                .collectAsMap();
        assertEquals(map.size(), 2);
        List<String> list1 = Lists.newArrayList(map.get("2016/01/31"));
        List<String> list2 = Lists.newArrayList(map.get("2015/01/31"));
        assertEquals(list1.size(), 3);
        assertEquals(list2.size(), 1);
    }

    @Test
    public void testLoadInvolvedFiles() throws IOException {
        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
        HoodieBloomIndex index = new HoodieBloomIndex(config);

        // Create some partitions, and put some files
        // "2016/01/21": 0 file
        // "2016/04/01": 1 file (2_0_20160401010101.parquet)
        // "2015/03/12": 3 files (1_0_20150312101010.parquet, 3_0_20150312101010.parquet,
        // 4_0_20150312101010.parquet)
        new File(basePath + "/2016/01/21").mkdirs();
        new File(basePath + "/2016/04/01").mkdirs();
        new File(basePath + "/2015/03/12").mkdirs();

        TestRawTripPayload rowChange1 = new TestRawTripPayload(
                "{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
        HoodieRecord record1 = new HoodieRecord(
                new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
        TestRawTripPayload rowChange2 = new TestRawTripPayload(
                "{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
        HoodieRecord record2 = new HoodieRecord(
                new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
        TestRawTripPayload rowChange3 = new TestRawTripPayload(
                "{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
        HoodieRecord record3 = new HoodieRecord(
                new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
        TestRawTripPayload rowChange4 = new TestRawTripPayload(
                "{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
        HoodieRecord record4 = new HoodieRecord(
                new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);

        writeParquetFile("2016/04/01", "2_0_20160401010101.parquet", Lists.newArrayList(), schema, null, false);
        writeParquetFile("2015/03/12", "1_0_20150312101010.parquet", Lists.newArrayList(), schema, null, false);
        writeParquetFile("2015/03/12", "3_0_20150312101010.parquet", Arrays.asList(record1), schema, null, false);
        writeParquetFile("2015/03/12", "4_0_20150312101010.parquet", Arrays.asList(record2, record3, record4),
                schema, null, false);

        List<String> partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12");
        HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
        HoodieTable table = HoodieTable.getHoodieTable(metadata, config, jsc);
        List<Tuple2<String, BloomIndexFileInfo>> filesList = index.loadInvolvedFiles(partitions, jsc, table);
        // Still 0, as no valid commit
        assertEquals(filesList.size(), 0);

        // Add some commits
        new File(basePath + "/.hoodie").mkdirs();
        new File(basePath + "/.hoodie/20160401010101.commit").createNewFile();
        new File(basePath + "/.hoodie/20150312101010.commit").createNewFile();

        table = HoodieTable.getHoodieTable(metadata, config, jsc);
        filesList = index.loadInvolvedFiles(partitions, jsc, table);
        assertEquals(filesList.size(), 4);
        // the two empty files will not have key ranges; the two files with records will
        assertNull(filesList.get(0)._2().getMaxRecordKey());
        assertNull(filesList.get(0)._2().getMinRecordKey());
        assertFalse(filesList.get(1)._2().hasKeyRanges());
        assertNotNull(filesList.get(2)._2().getMaxRecordKey());
        assertNotNull(filesList.get(2)._2().getMinRecordKey());
        assertTrue(filesList.get(3)._2().hasKeyRanges());

        // The result is no longer sorted, but it should contain the same files.

        List<Tuple2<String, BloomIndexFileInfo>> expected = Arrays.asList(
                new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2_0_20160401010101.parquet")),
                new Tuple2<>("2015/03/12", new BloomIndexFileInfo("1_0_20150312101010.parquet")),
                new Tuple2<>("2015/03/12", new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")),
                new Tuple2<>("2015/03/12", new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003")));
        assertEquals(expected, filesList);
    }

    @Test
    public void testRangePruning() {

        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
        HoodieBloomIndex index = new HoodieBloomIndex(config);

        final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo = new HashMap<>();
        partitionToFileIndexInfo.put("2017/10/22",
                Arrays.asList(new BloomIndexFileInfo("f1"), new BloomIndexFileInfo("f2", "000", "000"),
                        new BloomIndexFileInfo("f3", "001", "003"), new BloomIndexFileInfo("f4", "002", "007"),
                        new BloomIndexFileInfo("f5", "009", "010")));

        JavaPairRDD<String, String> partitionRecordKeyPairRDD = jsc
                .parallelize(Arrays.asList(new Tuple2<>("2017/10/22", "003"), new Tuple2<>("2017/10/22", "002"),
                        new Tuple2<>("2017/10/22", "005"), new Tuple2<>("2017/10/22", "004")))
                .mapToPair(t -> t);
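        // f1 carries no key range, so every lookup key must be checked against it. f2 (000..000)
        // and f5 (009..010) contain none of the keys 002..005 and are pruned; f3 and f4 are only
        // consulted for the keys that fall inside their ranges, giving 3 + 3 + 2 + 2 = 10 comparisons.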

        List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisonKeyList = index
                .explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD).collect();

        assertEquals(10, comparisonKeyList.size());
        Map<String, List<String>> recordKeyToFileComps = comparisonKeyList.stream()
                .collect(Collectors.groupingBy(t -> t._2()._2().getRecordKey(),
                        Collectors.mapping(t -> t._2()._1().split("#")[0], Collectors.toList())));

        assertEquals(4, recordKeyToFileComps.size());
        assertEquals(Arrays.asList("f1", "f3", "f4"), recordKeyToFileComps.get("002"));
        assertEquals(Arrays.asList("f1", "f3", "f4"), recordKeyToFileComps.get("003"));
        assertEquals(Arrays.asList("f1", "f4"), recordKeyToFileComps.get("004"));
        assertEquals(Arrays.asList("f1", "f4"), recordKeyToFileComps.get("005"));
    }

    @Test
    public void testCheckUUIDsAgainstOneFile() throws IOException, InterruptedException, ClassNotFoundException {

        // Create some records to use
        String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
                + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
        String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
                + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
        String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
                + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
        String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
                + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
        TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
        HoodieRecord record1 = new HoodieRecord(
                new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
        TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
        HoodieRecord record2 = new HoodieRecord(
                new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
        TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
        HoodieRecord record3 = new HoodieRecord(
                new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
        TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
        HoodieRecord record4 = new HoodieRecord(
                new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);

        // We write record1, record2 to a parquet file, but the bloom filter contains (record1,
        // record2, record3).
        BloomFilter filter = new BloomFilter(10000, 0.0000001);
        filter.add(record3.getRecordKey());
        String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema, filter, true);

        // The bloom filter contains 3 records
        assertTrue(filter.mightContain(record1.getRecordKey()));
        assertTrue(filter.mightContain(record2.getRecordKey()));
        assertTrue(filter.mightContain(record3.getRecordKey()));
        assertFalse(filter.mightContain(record4.getRecordKey()));

        // Compare with file
        List<String> uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(), record3.getRecordKey(),
                record4.getRecordKey());

        List<String> results = HoodieBloomIndexCheckFunction.checkCandidatesAgainstFile(jsc.hadoopConfiguration(),
                uuids, new Path(basePath + "/2016/01/31/" + filename));
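        // Only record1 and record2 were actually written to the file, so only their keys are found;
        // record3 is in the bloom filter but not in the file, and record4 is in neither.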
        assertEquals(results.size(), 2);
        assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")
                || results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0"));
        assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")
                || results.get(1).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0"));
        // TODO(vc): Need more coverage on actual filenames
        //assertTrue(results.get(0)._2().equals(filename));
        //assertTrue(results.get(1)._2().equals(filename));
    }

    @Test
    public void testTagLocationWithEmptyRDD() throws Exception {
        // We have some records to be tagged (two different partitions)
        JavaRDD<HoodieRecord> recordRDD = jsc.emptyRDD();
        // Also create the metadata and config
        HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
        HoodieTable table = HoodieTable.getHoodieTable(metadata, config, jsc);

        // Let's tag
        HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config);

        try {
            bloomIndex.tagLocation(recordRDD, jsc, table);
        } catch (IllegalArgumentException e) {
            fail("EmptyRDD should not result in IllegalArgumentException: Positive number of slices " + "required");
        }
    }

    @Test
    public void testTagLocation() throws Exception {
        // We have some records to be tagged (two different partitions)

        String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
                + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
        String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
                + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
        String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
                + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
        String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
                + "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
        TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
        HoodieRecord record1 = new HoodieRecord(
                new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
        TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
        HoodieRecord record2 = new HoodieRecord(
                new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
        TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
        HoodieRecord record3 = new HoodieRecord(
                new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
        TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
        HoodieRecord record4 = new HoodieRecord(
                new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
        JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));

        // Also create the metadata and config
        HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
        HoodieTable table = HoodieTable.getHoodieTable(metadata, config, jsc);

        // Let's tag
        HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config);
        JavaRDD<HoodieRecord> taggedRecordRDD = bloomIndex.tagLocation(recordRDD, jsc, table);

        // Should not find any files
        for (HoodieRecord record : taggedRecordRDD.collect()) {
            assertFalse(record.isCurrentLocationKnown());
        }

        // We create three parquet files, each holding one record (across two different partitions).
        String filename1 = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, null, true);
        String filename2 = writeParquetFile("2016/01/31", Arrays.asList(record2), schema, null, true);
        String filename3 = writeParquetFile("2015/01/31", Arrays.asList(record4), schema, null, true);

        // We do the tag again
        metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
        table = HoodieTable.getHoodieTable(metadata, config, jsc);

        taggedRecordRDD = bloomIndex.tagLocation(recordRDD, jsc, table);

        // Check results
        for (HoodieRecord record : taggedRecordRDD.collect()) {
            if (record.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
                assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename1)));
            } else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
                assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename2)));
            } else if (record.getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
                assertFalse(record.isCurrentLocationKnown());
            } else if (record.getRecordKey().equals("4eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
                assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename3)));
            }
        }
    }

    @Test
    public void testCheckExists() throws Exception {
        // We have some records to be tagged (two different partitions)

        String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
                + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
        String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
                + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
        String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
                + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
        String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
                + "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
        TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
        HoodieKey key1 = new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath());
        HoodieRecord record1 = new HoodieRecord(key1, rowChange1);
        TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
        HoodieKey key2 = new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath());
        HoodieRecord record2 = new HoodieRecord(key2, rowChange2);
        TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
        HoodieKey key3 = new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath());
        HoodieRecord record3 = new HoodieRecord(key3, rowChange3);
        TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
        HoodieKey key4 = new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath());
        HoodieRecord record4 = new HoodieRecord(key4, rowChange4);
        JavaRDD<HoodieKey> keysRDD = jsc.parallelize(Arrays.asList(key1, key2, key3, key4));

        // Also create the metadata and config
        HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
        HoodieTable table = HoodieTable.getHoodieTable(metadata, config, jsc);

        // Let's tag
        HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config);
        JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, jsc,
                table);

        // Should not find any files
        for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) {
            assertFalse(record._2.isPresent());
        }

        // We create three parquet files, each holding one record (across two different partitions).
        String filename1 = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, null, true);
        String filename2 = writeParquetFile("2016/01/31", Arrays.asList(record2), schema, null, true);
        String filename3 = writeParquetFile("2015/01/31", Arrays.asList(record4), schema, null, true);

        // We do the tag again
        metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
        table = HoodieTable.getHoodieTable(metadata, config, jsc);
        taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, jsc, table);

        // Check results
        for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) {
            if (record._1.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
                assertTrue(record._2.isPresent());
                Path path1 = new Path(record._2.get());
                assertEquals(FSUtils.getFileId(filename1), FSUtils.getFileId(path1.getName()));
            } else if (record._1.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
                assertTrue(record._2.isPresent());
                Path path2 = new Path(record._2.get());
                assertEquals(FSUtils.getFileId(filename2), FSUtils.getFileId(path2.getName()));
            } else if (record._1.getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
                assertFalse(record._2.isPresent());
            } else if (record._1.getRecordKey().equals("4eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
                assertTrue(record._2.isPresent());
                Path path3 = new Path(record._2.get());
                assertEquals(FSUtils.getFileId(filename3), FSUtils.getFileId(path3.getName()));
            }
        }
    }

    @Test
    public void testBloomFilterFalseError() throws IOException, InterruptedException {
        // We have two hoodie records
        String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
                + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
        String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
                + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";

        // We write record1 to a parquet file, using a bloom filter having both records
        TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
        HoodieRecord record1 = new HoodieRecord(
                new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
        TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
        HoodieRecord record2 = new HoodieRecord(
                new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);

        BloomFilter filter = new BloomFilter(10000, 0.0000001);
        filter.add(record2.getRecordKey());
        String filename = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, filter, true);
        assertTrue(filter.mightContain(record1.getRecordKey()));
        assertTrue(filter.mightContain(record2.getRecordKey()));
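        // record2 is a deliberate false positive: the filter says it might be in the file,
        // but it was never written there.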

        // We do the tag
        JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2));
        HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
        HoodieTable table = HoodieTable.getHoodieTable(metadata, config, jsc);

        HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config);
        JavaRDD<HoodieRecord> taggedRecordRDD = bloomIndex.tagLocation(recordRDD, jsc, table);

        // Check results
        for (HoodieRecord record : taggedRecordRDD.collect()) {
            if (record.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
                assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename)));
            } else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
                assertFalse(record.isCurrentLocationKnown());
            }
        }
    }

    private String writeParquetFile(String partitionPath, List<HoodieRecord> records, Schema schema,
            BloomFilter filter, boolean createCommitTime) throws IOException, InterruptedException {
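        // Sleep for a second so consecutive calls produce distinct commit times
        // (the commit time format below only has second granularity).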
        Thread.sleep(1000);
        String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
        String fileId = UUID.randomUUID().toString();
        String filename = FSUtils.makeDataFileName(commitTime, 1, fileId);

        return writeParquetFile(partitionPath, filename, records, schema, filter, createCommitTime);
    }

    private String writeParquetFile(String partitionPath, String filename, List<HoodieRecord> records,
            Schema schema, BloomFilter filter, boolean createCommitTime) throws IOException {

        if (filter == null) {
            filter = new BloomFilter(10000, 0.0000001);
        }
        HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema),
                schema, filter);
        String commitTime = FSUtils.getCommitTime(filename);
        HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
                ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
                HoodieTestUtils.getDefaultHadoopConf(),
                Double.valueOf(HoodieStorageConfig.DEFAULT_STREAM_COMPRESSION_RATIO));
        HoodieParquetWriter writer = new HoodieParquetWriter(commitTime,
                new Path(basePath + "/" + partitionPath + "/" + filename), config, schema);
        int seqId = 1;
        for (HoodieRecord record : records) {
            GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
            HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, commitTime, "" + seqId++);
            HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(),
                    filename);
            writer.writeAvro(record.getRecordKey(), avroRecord);
            filter.add(record.getRecordKey());
        }
        writer.close();

        if (createCommitTime) {
            // Also make sure the commit is valid
            new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME).mkdirs();
            new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit")
                    .createNewFile();
        }
        return filename;
    }
}