Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.uber.hoodie.common.table.log; import static com.uber.hoodie.common.util.SchemaTestUtil.getSimpleSchema; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import com.google.common.collect.Maps; import com.uber.hoodie.common.minicluster.MiniClusterUtil; import com.uber.hoodie.common.model.HoodieArchivedLogFile; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodieLogFile; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieTableType; import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.table.log.HoodieLogFormat.Reader; import com.uber.hoodie.common.table.log.HoodieLogFormat.Writer; import com.uber.hoodie.common.table.log.block.HoodieAvroDataBlock; import com.uber.hoodie.common.table.log.block.HoodieCommandBlock; import com.uber.hoodie.common.table.log.block.HoodieCorruptBlock; import com.uber.hoodie.common.table.log.block.HoodieDeleteBlock; import com.uber.hoodie.common.table.log.block.HoodieLogBlock; import com.uber.hoodie.common.table.log.block.HoodieLogBlock.HeaderMetadataType; import com.uber.hoodie.common.table.log.block.HoodieLogBlock.HoodieLogBlockType; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.HoodieAvroUtils; import com.uber.hoodie.common.util.SchemaTestUtil; import com.uber.hoodie.exception.CorruptedLogFileException; import java.io.IOException; import java.io.UncheckedIOException; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.junit.After; import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; import org.junit.rules.TemporaryFolder; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; @SuppressWarnings("Duplicates") @RunWith(Parameterized.class) public class HoodieLogFormatTest { private static final String BASE_OUTPUT_PATH = "/tmp/"; private static String basePath; private FileSystem fs; private Path partitionPath; private int bufferSize = 4096; private Boolean readBlocksLazily = true; public HoodieLogFormatTest(Boolean readBlocksLazily) { this.readBlocksLazily = readBlocksLazily; } @Parameterized.Parameters(name = "LogBlockReadMode") public static Collection<Boolean[]> data() { return Arrays.asList(new Boolean[][] { { true }, { false } }); } @BeforeClass public static void setUpClass() throws IOException, InterruptedException { // Append is not supported in LocalFileSystem. HDFS needs to be setup. MiniClusterUtil.setUp(); } @AfterClass public static void tearDownClass() { MiniClusterUtil.shutdown(); } @Before public void setUp() throws IOException, InterruptedException { this.fs = MiniClusterUtil.fileSystem; TemporaryFolder folder = new TemporaryFolder(); folder.create(); assertTrue(fs.mkdirs(new Path(folder.getRoot().getPath()))); this.partitionPath = new Path(folder.getRoot().getPath()); this.basePath = folder.getRoot().getParent(); HoodieTestUtils.initTableType(MiniClusterUtil.configuration, basePath, HoodieTableType.MERGE_ON_READ); } @After public void tearDown() throws IOException { fs.delete(partitionPath, true); } @Test public void testEmptyLog() throws IOException, InterruptedException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); assertEquals("Just created this log, size should be 0", 0, writer.getCurrentSize()); assertTrue("Check all log files should start with a .", writer.getLogFile().getFileName().startsWith(".")); assertEquals("Version should be 1 for new log created", 1, writer.getLogFile().getLogVersion()); } @Test public void testBasicAppend() throws IOException, InterruptedException, URISyntaxException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); long size = writer.getCurrentSize(); assertTrue("We just wrote a block - size should be > 0", size > 0); assertEquals("Write should be auto-flushed. The size reported by FileStatus and the writer should match", size, fs.getFileStatus(writer.getLogFile().getPath()).getLen()); writer.close(); } @Test public void testRollover() throws IOException, InterruptedException, URISyntaxException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header); // Write out a block writer = writer.appendBlock(dataBlock); // Get the size of the block long size = writer.getCurrentSize(); writer.close(); // Create a writer with the size threshold as the size we just wrote - so this has to roll writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).withSizeThreshold(size - 1).build(); records = SchemaTestUtil.generateTestRecords(0, 100); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); assertEquals("This should be a new log file and hence size should be 0", 0, writer.getCurrentSize()); assertEquals("Version should be rolled to 2", 2, writer.getLogFile().getLogVersion()); writer.close(); } @Test public void testConcurrentAppendOnExistingLogFileWithoutWriteToken() throws Exception { testConcurrentAppend(true, false); } @Test public void testConcurrentAppendOnExistingLogFileWithWriteToken() throws Exception { testConcurrentAppend(true, true); } @Test public void testConcurrentAppendOnFirstLogFileVersion() throws Exception { testConcurrentAppend(false, true); } private void testConcurrentAppend(boolean logFileExists, boolean newLogFileFormat) throws Exception { HoodieLogFormat.WriterBuilder builder1 = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs); HoodieLogFormat.WriterBuilder builder2 = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs); if (newLogFileFormat && logFileExists) { // Assume there is an existing log-file with write token builder1 = builder1.withLogVersion(1).withLogWriteToken(HoodieLogFormat.UNKNOWN_WRITE_TOKEN) .withRolloverLogWriteToken(HoodieLogFormat.UNKNOWN_WRITE_TOKEN); builder2 = builder2.withLogVersion(1).withLogWriteToken(HoodieLogFormat.UNKNOWN_WRITE_TOKEN) .withRolloverLogWriteToken(HoodieLogFormat.UNKNOWN_WRITE_TOKEN); } else if (newLogFileFormat) { // First log file of the file-slice builder1 = builder1.withLogVersion(HoodieLogFile.LOGFILE_BASE_VERSION) .withLogWriteToken(HoodieLogFormat.UNKNOWN_WRITE_TOKEN) .withRolloverLogWriteToken(HoodieLogFormat.UNKNOWN_WRITE_TOKEN); builder2 = builder2.withLogVersion(HoodieLogFile.LOGFILE_BASE_VERSION) .withLogWriteToken(HoodieLogFormat.UNKNOWN_WRITE_TOKEN) .withRolloverLogWriteToken(HoodieLogFormat.UNKNOWN_WRITE_TOKEN); } else { builder1 = builder1.withLogVersion(1).withRolloverLogWriteToken(HoodieLogFormat.UNKNOWN_WRITE_TOKEN); } Writer writer = builder1.build(); Writer writer2 = builder2.build(); HoodieLogFile logFile1 = writer.getLogFile(); HoodieLogFile logFile2 = writer2.getLogFile(); List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); writer2 = writer2.appendBlock(dataBlock); writer.close(); writer2.close(); assertNotNull(logFile1.getLogWriteToken()); assertEquals("Log Files must have different versions", logFile1.getLogVersion(), logFile2.getLogVersion() - 1); } @Test public void testMultipleAppend() throws IOException, URISyntaxException, InterruptedException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); long size1 = writer.getCurrentSize(); writer.close(); writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); long size2 = writer.getCurrentSize(); assertTrue("We just wrote a new block - size2 should be > size1", size2 > size1); assertEquals("Write should be auto-flushed. The size reported by FileStatus and the writer should match", size2, fs.getFileStatus(writer.getLogFile().getPath()).getLen()); writer.close(); // Close and Open again and append 100 more records writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); long size3 = writer.getCurrentSize(); assertTrue("We just wrote a new block - size3 should be > size2", size3 > size2); assertEquals("Write should be auto-flushed. The size reported by FileStatus and the writer should match", size3, fs.getFileStatus(writer.getLogFile().getPath()).getLen()); writer.close(); // Cannot get the current size after closing the log try { writer.getCurrentSize(); fail("getCurrentSize should fail after the logAppender is closed"); } catch (IllegalStateException e) { // pass } } /** * This is actually a test on concurrent append and not recovery lease. * Commenting this out. * https://issues.apache.org/jira/browse/HUDI-117 */ /** @Test public void testLeaseRecovery() throws IOException, URISyntaxException, InterruptedException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") .overBaseCommit("100").withFs(fs).build(); List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); long size1 = writer.getCurrentSize(); // do not close this writer - this simulates a data note appending to a log dying without closing the file // writer.close(); writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); long size2 = writer.getCurrentSize(); assertTrue("We just wrote a new block - size2 should be > size1", size2 > size1); assertEquals("Write should be auto-flushed. The size reported by FileStatus and the writer should match", size2, fs.getFileStatus(writer.getLogFile().getPath()).getLen()); writer.close(); } **/ @Test public void testAppendNotSupported() throws IOException, URISyntaxException, InterruptedException { // Use some fs like LocalFileSystem, that does not support appends Path localPartitionPath = new Path("file://" + partitionPath); FileSystem localFs = FSUtils.getFs(localPartitionPath.toString(), HoodieTestUtils.getDefaultHadoopConf()); Path testPath = new Path(localPartitionPath, "append_test"); localFs.mkdirs(testPath); // Some data & append two times. List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header); for (int i = 0; i < 2; i++) { HoodieLogFormat.newWriterBuilder().onParentPath(testPath) .withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION).withFileId("commits.archive") .overBaseCommit("").withFs(localFs).build().appendBlock(dataBlock).close(); } // ensure there are two log file versions, with same data. FileStatus[] statuses = localFs.listStatus(testPath); assertEquals(2, statuses.length); } @SuppressWarnings("unchecked") @Test public void testBasicWriteAndScan() throws IOException, URISyntaxException, InterruptedException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); Schema schema = getSimpleSchema(); List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100); List<IndexedRecord> copyOfRecords = records.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); writer.close(); Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema()); assertTrue("We wrote a block, we should be able to read it", reader.hasNext()); HoodieLogBlock nextBlock = reader.next(); assertEquals("The next block should be a data block", HoodieLogBlockType.AVRO_DATA_BLOCK, nextBlock.getBlockType()); HoodieAvroDataBlock dataBlockRead = (HoodieAvroDataBlock) nextBlock; assertEquals("Read records size should be equal to the written records size", copyOfRecords.size(), dataBlockRead.getRecords().size()); assertEquals("Both records lists should be the same. (ordering guaranteed)", copyOfRecords, dataBlockRead.getRecords()); reader.close(); } @SuppressWarnings("unchecked") @Test public void testBasicAppendAndRead() throws IOException, URISyntaxException, InterruptedException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); List<IndexedRecord> records1 = SchemaTestUtil.generateTestRecords(0, 100); Schema schema = getSimpleSchema(); List<IndexedRecord> copyOfRecords1 = records1.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header); writer = writer.appendBlock(dataBlock); writer.close(); writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); List<IndexedRecord> records2 = SchemaTestUtil.generateTestRecords(0, 100); List<IndexedRecord> copyOfRecords2 = records2.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); dataBlock = new HoodieAvroDataBlock(records2, header); writer = writer.appendBlock(dataBlock); writer.close(); // Close and Open again and append 100 more records writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); List<IndexedRecord> records3 = SchemaTestUtil.generateTestRecords(0, 100); List<IndexedRecord> copyOfRecords3 = records3.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); dataBlock = new HoodieAvroDataBlock(records3, header); writer = writer.appendBlock(dataBlock); writer.close(); Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema()); assertTrue("First block should be available", reader.hasNext()); HoodieLogBlock nextBlock = reader.next(); HoodieAvroDataBlock dataBlockRead = (HoodieAvroDataBlock) nextBlock; assertEquals("Read records size should be equal to the written records size", copyOfRecords1.size(), dataBlockRead.getRecords().size()); assertEquals("Both records lists should be the same. (ordering guaranteed)", copyOfRecords1, dataBlockRead.getRecords()); reader.hasNext(); nextBlock = reader.next(); dataBlockRead = (HoodieAvroDataBlock) nextBlock; assertEquals("Read records size should be equal to the written records size", copyOfRecords2.size(), dataBlockRead.getRecords().size()); assertEquals("Both records lists should be the same. (ordering guaranteed)", copyOfRecords2, dataBlockRead.getRecords()); reader.hasNext(); nextBlock = reader.next(); dataBlockRead = (HoodieAvroDataBlock) nextBlock; assertEquals("Read records size should be equal to the written records size", copyOfRecords3.size(), dataBlockRead.getRecords().size()); assertEquals("Both records lists should be the same. (ordering guaranteed)", copyOfRecords3, dataBlockRead.getRecords()); reader.close(); } @SuppressWarnings("unchecked") @Test public void testBasicAppendAndScanMultipleFiles() throws IOException, URISyntaxException, InterruptedException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withSizeThreshold(1024).withFileId("test-fileid1") .overBaseCommit("100").withFs(fs).build(); Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); Set<HoodieLogFile> logFiles = new HashSet<>(); List<List<IndexedRecord>> allRecords = new ArrayList<>(); // create 4 log files while (writer.getLogFile().getLogVersion() != 4) { logFiles.add(writer.getLogFile()); List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); List<IndexedRecord> copyOfRecords1 = records1.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); allRecords.add(copyOfRecords1); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header); writer = writer.appendBlock(dataBlock); } writer.close(); // scan all log blocks (across multiple log files) HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, logFiles.stream().map(logFile -> logFile.getPath().toString()).collect(Collectors.toList()), schema, "100", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); List<IndexedRecord> scannedRecords = new ArrayList<>(); for (HoodieRecord record : scanner) { scannedRecords.add((IndexedRecord) record.getData().getInsertValue(schema).get()); } assertEquals("Scanner records count should be the same as appended records", scannedRecords.size(), allRecords.stream().flatMap(records -> records.stream()).collect(Collectors.toList()).size()); } @Test public void testAppendAndReadOnCorruptedLog() throws IOException, URISyntaxException, InterruptedException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); writer.close(); // Append some arbit byte[] to thee end of the log (mimics a partially written commit) fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf()); FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath()); // create a block with outputStream.write(HoodieLogFormat.MAGIC); // Write out a length that does not confirm with the content outputStream.writeLong(1000); outputStream.writeInt(HoodieLogBlockType.AVRO_DATA_BLOCK.ordinal()); outputStream.writeInt(HoodieLogFormat.currentVersion); // Write out a length that does not confirm with the content outputStream.writeLong(500); // Write out some bytes outputStream.write("something-random".getBytes()); outputStream.flush(); outputStream.close(); // First round of reads - we should be able to read the first block and then EOF Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema()); assertTrue("First block should be available", reader.hasNext()); reader.next(); assertTrue("We should have corrupted block next", reader.hasNext()); HoodieLogBlock block = reader.next(); assertEquals("The read block should be a corrupt block", HoodieLogBlockType.CORRUPT_BLOCK, block.getBlockType()); HoodieCorruptBlock corruptBlock = (HoodieCorruptBlock) block; //assertEquals("", "something-random", new String(corruptBlock.getCorruptedBytes())); assertFalse("There should be no more block left", reader.hasNext()); reader.close(); // Simulate another failure back to back outputStream = fs.append(writer.getLogFile().getPath()); // create a block with outputStream.write(HoodieLogFormat.MAGIC); // Write out a length that does not confirm with the content outputStream.writeLong(1000); outputStream.writeInt(HoodieLogBlockType.AVRO_DATA_BLOCK.ordinal()); outputStream.writeInt(HoodieLogFormat.currentVersion); // Write out a length that does not confirm with the content outputStream.writeLong(500); // Write out some bytes outputStream.write("something-else-random".getBytes()); outputStream.flush(); outputStream.close(); // Should be able to append a new block writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); writer.close(); // Second round of reads - we should be able to read the first and last block reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema()); assertTrue("First block should be available", reader.hasNext()); reader.next(); assertTrue("We should get the 1st corrupted block next", reader.hasNext()); reader.next(); assertTrue("We should get the 2nd corrupted block next", reader.hasNext()); block = reader.next(); assertEquals("The read block should be a corrupt block", HoodieLogBlockType.CORRUPT_BLOCK, block.getBlockType()); corruptBlock = (HoodieCorruptBlock) block; //assertEquals("", "something-else-random", new String(corruptBlock.getCorruptedBytes())); assertTrue("We should get the last block next", reader.hasNext()); reader.next(); assertFalse("We should have no more blocks left", reader.hasNext()); reader.close(); } @Test public void testAvroLogRecordReaderBasic() throws IOException, URISyntaxException, InterruptedException { Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).withSizeThreshold(500).build(); // Write 1 List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); List<IndexedRecord> copyOfRecords1 = records1.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header); writer = writer.appendBlock(dataBlock); // Write 2 List<IndexedRecord> records2 = SchemaTestUtil.generateHoodieTestRecords(0, 100); List<IndexedRecord> copyOfRecords2 = records2.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); dataBlock = new HoodieAvroDataBlock(records2, header); writer = writer.appendBlock(dataBlock); writer.close(); List<String> allLogFiles = FSUtils .getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") .map(s -> s.getPath().toString()).collect(Collectors.toList()); HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "100", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); assertEquals("", 200, scanner.getTotalLogRecords()); Set<String> readKeys = new HashSet<>(200); scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey())); assertEquals("Stream collect should return all 200 records", 200, readKeys.size()); copyOfRecords1.addAll(copyOfRecords2); Set<String> originalKeys = copyOfRecords1.stream() .map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) .collect(Collectors.toSet()); assertEquals("CompositeAvroLogReader should return 200 records from 2 versions", originalKeys, readKeys); } @Test public void testAvroLogRecordReaderWithRollbackTombstone() throws IOException, URISyntaxException, InterruptedException { Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); // Write 1 List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); List<IndexedRecord> copyOfRecords1 = records1.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header); writer = writer.appendBlock(dataBlock); // Write 2 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "101"); List<IndexedRecord> records2 = SchemaTestUtil.generateHoodieTestRecords(0, 100); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); dataBlock = new HoodieAvroDataBlock(records2, header); writer = writer.appendBlock(dataBlock); // Rollback the last write header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "101"); header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); HoodieCommandBlock commandBlock = new HoodieCommandBlock(header); writer = writer.appendBlock(commandBlock); // Write 3 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "102"); List<IndexedRecord> records3 = SchemaTestUtil.generateHoodieTestRecords(0, 100); List<IndexedRecord> copyOfRecords3 = records3.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); dataBlock = new HoodieAvroDataBlock(records3, header); writer = writer.appendBlock(dataBlock); writer.close(); List<String> allLogFiles = FSUtils .getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") .map(s -> s.getPath().toString()).collect(Collectors.toList()); HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "102", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); assertEquals("We read 200 records from 2 write batches", 200, scanner.getTotalLogRecords()); Set<String> readKeys = new HashSet<>(200); scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey())); assertEquals("Stream collect should return all 200 records", 200, readKeys.size()); copyOfRecords1.addAll(copyOfRecords3); Set<String> originalKeys = copyOfRecords1.stream() .map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) .collect(Collectors.toSet()); assertEquals("CompositeAvroLogReader should return 200 records from 2 versions", originalKeys, readKeys); } @Test public void testAvroLogRecordReaderWithRollbackPartialBlock() throws IOException, URISyntaxException, InterruptedException { Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); // Write 1 List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); List<IndexedRecord> copyOfRecords1 = records1.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header); writer = writer.appendBlock(dataBlock); writer.close(); // Write 2 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "101"); // Append some arbit byte[] to thee end of the log (mimics a partially written commit) fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf()); FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath()); // create a block with outputStream.write(HoodieLogFormat.MAGIC); // Write out a length that does not confirm with the content outputStream.writeLong(1000); outputStream.writeInt(HoodieLogFormat.currentVersion); outputStream.writeInt(HoodieLogBlockType.AVRO_DATA_BLOCK.ordinal()); // Write out some header outputStream.write(HoodieLogBlock.getLogMetadataBytes(header)); outputStream.writeLong("something-random".getBytes().length); outputStream.write("something-random".getBytes()); outputStream.flush(); outputStream.close(); // Rollback the last write header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "102"); header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "101"); header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); HoodieCommandBlock commandBlock = new HoodieCommandBlock(header); writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); writer = writer.appendBlock(commandBlock); // Write 3 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "103"); List<IndexedRecord> records3 = SchemaTestUtil.generateHoodieTestRecords(0, 100); List<IndexedRecord> copyOfRecords3 = records3.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); dataBlock = new HoodieAvroDataBlock(records3, header); writer = writer.appendBlock(dataBlock); writer.close(); List<String> allLogFiles = FSUtils .getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") .map(s -> s.getPath().toString()).collect(Collectors.toList()); HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "103", 10240L, true, false, bufferSize, BASE_OUTPUT_PATH); assertEquals("We would read 200 records", 200, scanner.getTotalLogRecords()); Set<String> readKeys = new HashSet<>(200); scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey())); assertEquals("Stream collect should return all 200 records", 200, readKeys.size()); copyOfRecords1.addAll(copyOfRecords3); Set<String> originalKeys = copyOfRecords1.stream() .map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) .collect(Collectors.toSet()); assertEquals("CompositeAvroLogReader should return 200 records from 2 versions", originalKeys, readKeys); } @Test public void testAvroLogRecordReaderWithDeleteAndRollback() throws IOException, URISyntaxException, InterruptedException { Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); // Write 1 List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); List<IndexedRecord> copyOfRecords1 = records1.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header); writer = writer.appendBlock(dataBlock); // Write 2 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "101"); List<IndexedRecord> records2 = SchemaTestUtil.generateHoodieTestRecords(0, 100); List<IndexedRecord> copyOfRecords2 = records2.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); dataBlock = new HoodieAvroDataBlock(records2, header); writer = writer.appendBlock(dataBlock); copyOfRecords1.addAll(copyOfRecords2); List<String> originalKeys = copyOfRecords1.stream() .map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) .collect(Collectors.toList()); // Delete 50 keys List<HoodieKey> deletedKeys = copyOfRecords1.stream() .map(s -> (new HoodieKey(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), ((GenericRecord) s).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString()))) .collect(Collectors.toList()).subList(0, 50); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "102"); HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new HoodieKey[50]), header); writer = writer.appendBlock(deleteBlock); List<String> allLogFiles = FSUtils .getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") .map(s -> s.getPath().toString()).collect(Collectors.toList()); HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "102", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); assertEquals("We still would read 200 records", 200, scanner.getTotalLogRecords()); final List<String> readKeys = new ArrayList<>(200); final List<Boolean> emptyPayloads = new ArrayList<>(); scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey())); scanner.forEach(s -> { try { if (!s.getData().getInsertValue(schema).isPresent()) { emptyPayloads.add(true); } } catch (IOException io) { throw new UncheckedIOException(io); } }); assertEquals("Stream collect should return all 200 records", 200, readKeys.size()); assertEquals("Stream collect should return all 50 records with empty payloads", 50, emptyPayloads.size()); originalKeys.removeAll(deletedKeys); Collections.sort(originalKeys); Collections.sort(readKeys); assertEquals("CompositeAvroLogReader should return 150 records from 2 versions", originalKeys, readKeys); // Rollback the last block header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "103"); header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "102"); header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); HoodieCommandBlock commandBlock = new HoodieCommandBlock(header); writer = writer.appendBlock(commandBlock); readKeys.clear(); scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "101", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey())); assertEquals("Stream collect should return all 200 records after rollback of delete", 200, readKeys.size()); } @Test public void testAvroLogRecordReaderWithFailedRollbacks() throws IOException, URISyntaxException, InterruptedException { // Write a Data block and Delete block with same InstantTime (written in same batch) Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); // Write 1 List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); List<IndexedRecord> copyOfRecords1 = records1.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header); writer = writer.appendBlock(dataBlock); // Write 2 List<IndexedRecord> records2 = SchemaTestUtil.generateHoodieTestRecords(0, 100); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); dataBlock = new HoodieAvroDataBlock(records2, header); writer = writer.appendBlock(dataBlock); List<String> originalKeys = copyOfRecords1.stream() .map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) .collect(Collectors.toList()); // Delete 50 keys // Delete 50 keys List<HoodieKey> deletedKeys = copyOfRecords1.stream() .map(s -> (new HoodieKey(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), ((GenericRecord) s).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString()))) .collect(Collectors.toList()).subList(0, 50); HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new HoodieKey[50]), header); writer = writer.appendBlock(deleteBlock); // Attempt 1 : Write rollback block for a failed write header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); HoodieCommandBlock commandBlock = new HoodieCommandBlock(header); try { writer = writer.appendBlock(commandBlock); // Say job failed, retry writing 2 rollback in the next rollback(..) attempt throw new Exception("simulating failure"); } catch (Exception e) { // it's okay } // Attempt 2 : Write another rollback blocks for a failed write writer = writer.appendBlock(commandBlock); List<String> allLogFiles = FSUtils .getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") .map(s -> s.getPath().toString()).collect(Collectors.toList()); // all data must be rolled back before merge HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "100", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); assertEquals("We would have scanned 0 records because of rollback", 0, scanner.getTotalLogRecords()); final List<String> readKeys = new ArrayList<>(); scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey())); assertEquals("Stream collect should return all 0 records", 0, readKeys.size()); } @Test public void testAvroLogRecordReaderWithInsertDeleteAndRollback() throws IOException, URISyntaxException, InterruptedException { // Write a Data block and Delete block with same InstantTime (written in same batch) Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); // Write 1 List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); List<IndexedRecord> copyOfRecords1 = records1.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header); writer = writer.appendBlock(dataBlock); List<String> originalKeys = copyOfRecords1.stream() .map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) .collect(Collectors.toList()); // Delete 50 keys // Delete 50 keys List<HoodieKey> deletedKeys = copyOfRecords1.stream() .map(s -> (new HoodieKey(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), ((GenericRecord) s).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString()))) .collect(Collectors.toList()).subList(0, 50); HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new HoodieKey[50]), header); writer = writer.appendBlock(deleteBlock); // Write 2 rollback blocks (1 data block + 1 delete bloc) for a failed write header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); HoodieCommandBlock commandBlock = new HoodieCommandBlock(header); writer = writer.appendBlock(commandBlock); writer = writer.appendBlock(commandBlock); List<String> allLogFiles = FSUtils .getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") .map(s -> s.getPath().toString()).collect(Collectors.toList()); HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "100", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); assertEquals("We would read 0 records", 0, scanner.getTotalLogRecords()); } @Test public void testAvroLogRecordReaderWithInvalidRollback() throws IOException, URISyntaxException, InterruptedException { Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); // Write 1 List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header); writer = writer.appendBlock(dataBlock); // Write invalid rollback for a failed write (possible for in-flight commits) header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "101"); header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); HoodieCommandBlock commandBlock = new HoodieCommandBlock(header); writer = writer.appendBlock(commandBlock); List<String> allLogFiles = FSUtils .getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") .map(s -> s.getPath().toString()).collect(Collectors.toList()); HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "100", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); assertEquals("We still would read 100 records", 100, scanner.getTotalLogRecords()); final List<String> readKeys = new ArrayList<>(100); scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey())); assertEquals("Stream collect should return all 150 records", 100, readKeys.size()); } @Test public void testAvroLogRecordReaderWithInsertsDeleteAndRollback() throws IOException, URISyntaxException, InterruptedException { // Write a 3 Data blocs with same InstantTime (written in same batch) Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); // Write 1 List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); List<IndexedRecord> copyOfRecords1 = records1.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header); writer = writer.appendBlock(dataBlock); writer = writer.appendBlock(dataBlock); writer = writer.appendBlock(dataBlock); List<String> originalKeys = copyOfRecords1.stream() .map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString()) .collect(Collectors.toList()); // Delete 50 keys // Delete 50 keys List<HoodieKey> deletedKeys = copyOfRecords1.stream() .map(s -> (new HoodieKey(((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(), ((GenericRecord) s).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString()))) .collect(Collectors.toList()).subList(0, 50); HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new HoodieKey[50]), header); writer = writer.appendBlock(deleteBlock); // Write 1 rollback block for a failed write header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "101"); header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); HoodieCommandBlock commandBlock = new HoodieCommandBlock(header); writer = writer.appendBlock(commandBlock); List<String> allLogFiles = FSUtils .getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") .map(s -> s.getPath().toString()).collect(Collectors.toList()); HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "101", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); assertEquals("We would read 0 records", 0, scanner.getTotalLogRecords()); } @Test public void testAvroLogRecordReaderWithMixedInsertsCorruptsAndRollback() throws IOException, URISyntaxException, InterruptedException { // Write a 3 Data blocs with same InstantTime (written in same batch) Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); // Set a small threshold so that every block is a new version Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); // Write 1 List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header); writer = writer.appendBlock(dataBlock); writer = writer.appendBlock(dataBlock); writer = writer.appendBlock(dataBlock); writer.close(); // Append some arbit byte[] to the end of the log (mimics a partially written commit) fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf()); FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath()); // create a block with outputStream.write(HoodieLogFormat.MAGIC); outputStream.writeLong(1000); outputStream.writeInt(HoodieLogBlockType.AVRO_DATA_BLOCK.ordinal()); outputStream.writeInt(HoodieLogFormat.currentVersion); // Write out a length that does not confirm with the content outputStream.writeLong(100); outputStream.flush(); outputStream.close(); // Append some arbit byte[] to the end of the log (mimics a partially written commit) fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf()); outputStream = fs.append(writer.getLogFile().getPath()); // create a block with outputStream.write(HoodieLogFormat.MAGIC); outputStream.writeLong(1000); outputStream.writeInt(HoodieLogBlockType.AVRO_DATA_BLOCK.ordinal()); outputStream.writeInt(HoodieLogFormat.currentVersion); // Write out a length that does not confirm with the content outputStream.writeLong(100); outputStream.flush(); outputStream.close(); writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); writer = writer.appendBlock(dataBlock); writer.close(); // Append some arbit byte[] to the end of the log (mimics a partially written commit) fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf()); outputStream = fs.append(writer.getLogFile().getPath()); // create a block with outputStream.write(HoodieLogFormat.MAGIC); outputStream.writeLong(1000); outputStream.writeInt(HoodieLogBlockType.AVRO_DATA_BLOCK.ordinal()); outputStream.writeInt(HoodieLogFormat.currentVersion); // Write out a length that does not confirm with the content outputStream.writeLong(100); outputStream.flush(); outputStream.close(); writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); // Write 1 rollback block for the last commit instant header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "101"); header.put(HeaderMetadataType.TARGET_INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); HoodieCommandBlock commandBlock = new HoodieCommandBlock(header); writer = writer.appendBlock(commandBlock); writer.close(); List<String> allLogFiles = FSUtils .getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100") .map(s -> s.getPath().toString()).collect(Collectors.toList()); HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "101", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH); assertEquals("We would read 0 records", 0, scanner.getTotalLogRecords()); } @SuppressWarnings("unchecked") @Test public void testBasicAppendAndReadInReverse() throws IOException, URISyntaxException, InterruptedException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); Schema schema = getSimpleSchema(); List<IndexedRecord> records1 = SchemaTestUtil.generateTestRecords(0, 100); List<IndexedRecord> copyOfRecords1 = records1.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header); writer = writer.appendBlock(dataBlock); writer.close(); writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); List<IndexedRecord> records2 = SchemaTestUtil.generateTestRecords(0, 100); List<IndexedRecord> copyOfRecords2 = records2.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); dataBlock = new HoodieAvroDataBlock(records2, header); writer = writer.appendBlock(dataBlock); writer.close(); // Close and Open again and append 100 more records writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); List<IndexedRecord> records3 = SchemaTestUtil.generateTestRecords(0, 100); List<IndexedRecord> copyOfRecords3 = records3.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); dataBlock = new HoodieAvroDataBlock(records3, header); writer = writer.appendBlock(dataBlock); writer.close(); HoodieLogFileReader reader = new HoodieLogFileReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(), bufferSize, readBlocksLazily, true); assertTrue("Last block should be available", reader.hasPrev()); HoodieLogBlock prevBlock = reader.prev(); HoodieAvroDataBlock dataBlockRead = (HoodieAvroDataBlock) prevBlock; assertEquals("Third records size should be equal to the written records size", copyOfRecords3.size(), dataBlockRead.getRecords().size()); assertEquals("Both records lists should be the same. (ordering guaranteed)", copyOfRecords3, dataBlockRead.getRecords()); assertTrue("Second block should be available", reader.hasPrev()); prevBlock = reader.prev(); dataBlockRead = (HoodieAvroDataBlock) prevBlock; assertEquals("Read records size should be equal to the written records size", copyOfRecords2.size(), dataBlockRead.getRecords().size()); assertEquals("Both records lists should be the same. (ordering guaranteed)", copyOfRecords2, dataBlockRead.getRecords()); assertTrue("First block should be available", reader.hasPrev()); prevBlock = reader.prev(); dataBlockRead = (HoodieAvroDataBlock) prevBlock; assertEquals("Read records size should be equal to the written records size", copyOfRecords1.size(), dataBlockRead.getRecords().size()); assertEquals("Both records lists should be the same. (ordering guaranteed)", copyOfRecords1, dataBlockRead.getRecords()); assertFalse(reader.hasPrev()); reader.close(); } @Test public void testAppendAndReadOnCorruptedLogInReverse() throws IOException, URISyntaxException, InterruptedException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); Schema schema = getSimpleSchema(); List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); writer.close(); // Append some arbit byte[] to thee end of the log (mimics a partially written commit) fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf()); FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath()); // create a block with outputStream.write(HoodieLogFormat.MAGIC); outputStream.writeInt(HoodieLogBlockType.AVRO_DATA_BLOCK.ordinal()); // Write out a length that does not confirm with the content outputStream.writeInt(1000); // Write out footer length outputStream.writeInt(1); // Write out some metadata // TODO : test for failure to write metadata - NA ? outputStream.write(HoodieLogBlock.getLogMetadataBytes(header)); outputStream.write("something-random".getBytes()); outputStream.flush(); outputStream.close(); // Should be able to append a new block writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); writer.close(); // First round of reads - we should be able to read the first block and then EOF HoodieLogFileReader reader = new HoodieLogFileReader(fs, writer.getLogFile(), schema, bufferSize, readBlocksLazily, true); assertTrue("Last block should be available", reader.hasPrev()); HoodieLogBlock block = reader.prev(); assertTrue("Last block should be datablock", block instanceof HoodieAvroDataBlock); assertTrue("Last block should be available", reader.hasPrev()); try { reader.prev(); } catch (CorruptedLogFileException e) { e.printStackTrace(); // We should have corrupted block } reader.close(); } @SuppressWarnings("unchecked") @Test public void testBasicAppendAndTraverseInReverse() throws IOException, URISyntaxException, InterruptedException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); Schema schema = getSimpleSchema(); List<IndexedRecord> records1 = SchemaTestUtil.generateTestRecords(0, 100); List<IndexedRecord> copyOfRecords1 = records1.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header); writer = writer.appendBlock(dataBlock); writer.close(); writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); List<IndexedRecord> records2 = SchemaTestUtil.generateTestRecords(0, 100); List<IndexedRecord> copyOfRecords2 = records2.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); dataBlock = new HoodieAvroDataBlock(records2, header); writer = writer.appendBlock(dataBlock); writer.close(); // Close and Open again and append 100 more records writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") .withFs(fs).build(); List<IndexedRecord> records3 = SchemaTestUtil.generateTestRecords(0, 100); List<IndexedRecord> copyOfRecords3 = records3.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)) .collect(Collectors.toList()); dataBlock = new HoodieAvroDataBlock(records3, header); writer = writer.appendBlock(dataBlock); writer.close(); HoodieLogFileReader reader = new HoodieLogFileReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(), bufferSize, readBlocksLazily, true); assertTrue("Third block should be available", reader.hasPrev()); reader.moveToPrev(); assertTrue("Second block should be available", reader.hasPrev()); reader.moveToPrev(); // After moving twice, this last reader.prev() should read the First block written assertTrue("First block should be available", reader.hasPrev()); HoodieLogBlock prevBlock = reader.prev(); HoodieAvroDataBlock dataBlockRead = (HoodieAvroDataBlock) prevBlock; assertEquals("Read records size should be equal to the written records size", copyOfRecords1.size(), dataBlockRead.getRecords().size()); assertEquals("Both records lists should be the same. (ordering guaranteed)", copyOfRecords1, dataBlockRead.getRecords()); assertFalse(reader.hasPrev()); reader.close(); } }