Java tutorial: TestUtil.java, a Hive sync test harness from Hoodie's com.uber.hoodie.hive package
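The class below is TestUtil from Hoodie's hoodie-hive module. It boots a shared mini HDFS cluster, a ZooKeeper server, and a HiveServer2 instance, then exposes helpers that lay down copy-on-write (COW) and merge-on-read (MOR) datasets: parquet data files, Avro log files, and the commit, compaction, and delta-commit metadata files that Hive sync reads from the Hoodie timeline. A short usage sketch follows the class.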
/*
 * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *          http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.hive;

import static com.uber.hoodie.common.model.HoodieTestUtils.DEFAULT_TASK_PARTITIONID;
import static org.junit.Assert.fail;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import com.uber.hoodie.common.BloomFilter;
import com.uber.hoodie.common.minicluster.HdfsTestService;
import com.uber.hoodie.common.minicluster.ZookeeperTestService;
import com.uber.hoodie.common.model.CompactionWriteStat;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.common.model.HoodieCompactionMetadata;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieDeltaWriteStat;
import com.uber.hoodie.common.model.HoodieLogFile;
import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.model.HoodieWriteStat;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.log.HoodieLogFormat;
import com.uber.hoodie.common.table.log.HoodieLogFormat.Writer;
import com.uber.hoodie.common.table.log.block.HoodieAvroDataBlock;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.SchemaTestUtil;
import com.uber.hoodie.hive.util.HiveTestService;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import java.util.UUID;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hive.service.server.HiveServer2;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.zookeeper.server.ZooKeeperServer;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.junit.runners.model.InitializationError;

@SuppressWarnings("SameParameterValue")
public class TestUtil {

  private static MiniDFSCluster dfsCluster;
  private static ZooKeeperServer zkServer;
  private static HiveServer2 hiveServer;
  private static Configuration configuration;
  static HiveSyncConfig hiveSyncConfig;
  private static DateTimeFormatter dtfOut;
  static FileSystem fileSystem;
  private static Set<String> createdTablesSet = Sets.newHashSet();

  public static void setUp() throws IOException, InterruptedException, URISyntaxException {
    if (dfsCluster == null) {
      // Boot the shared mini HDFS, ZooKeeper, and Hive services once per JVM.
      HdfsTestService service = new HdfsTestService();
      dfsCluster = service.start(true);
      configuration = service.getHadoopConf();
    }
    if (zkServer == null) {
      ZookeeperTestService zkService = new ZookeeperTestService(configuration);
      zkServer = zkService.start();
    }
    if (hiveServer == null) {
      HiveTestService hiveService = new HiveTestService(configuration);
      hiveServer = hiveService.start();
    }
    fileSystem = FileSystem.get(configuration);

    // Sync configuration shared by every test driven through this harness.
    hiveSyncConfig = new HiveSyncConfig();
    hiveSyncConfig.jdbcUrl = "jdbc:hive2://127.0.0.1:9999/";
    hiveSyncConfig.hiveUser = "";
    hiveSyncConfig.hivePass = "";
    hiveSyncConfig.databaseName = "testdb";
    hiveSyncConfig.tableName = "test1";
    hiveSyncConfig.basePath = "/tmp/hdfs/HiveSyncToolTest/";
    hiveSyncConfig.assumeDatePartitioning = true;
    hiveSyncConfig.partitionFields = Lists.newArrayList("datestr");
    dtfOut = DateTimeFormat.forPattern("yyyy/MM/dd");

    clear();
  }

  static void clear() throws IOException {
    // Wipe the dataset on HDFS, re-initialize the table, and reset the Hive database.
    fileSystem.delete(new Path(hiveSyncConfig.basePath), true);
    HoodieTableMetaClient.initTableType(fileSystem, hiveSyncConfig.basePath,
        HoodieTableType.COPY_ON_WRITE, hiveSyncConfig.tableName);

    HoodieHiveClient client =
        new HoodieHiveClient(hiveSyncConfig, hiveServer.getHiveConf(), fileSystem);
    for (String tableName : createdTablesSet) {
      client.updateHiveSQL("drop table if exists " + tableName);
    }
    createdTablesSet.clear();
    client.updateHiveSQL("drop database if exists " + hiveSyncConfig.databaseName);
    client.updateHiveSQL("create database " + hiveSyncConfig.databaseName);
  }

  static HiveConf getHiveConf() {
    return hiveServer.getHiveConf();
  }

  @SuppressWarnings("unused")
  public static void shutdown() {
    if (hiveServer != null) {
      hiveServer.stop();
    }
    if (dfsCluster != null) {
      dfsCluster.shutdown();
    }
    if (zkServer != null) {
      zkServer.shutdown();
    }
  }

  static void createCOWDataset(String commitTime, int numberOfPartitions)
      throws IOException, InitializationError, URISyntaxException, InterruptedException {
    Path path = new Path(hiveSyncConfig.basePath);
    FileUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
    HoodieTableMetaClient.initTableType(fileSystem, hiveSyncConfig.basePath,
        HoodieTableType.COPY_ON_WRITE, hiveSyncConfig.tableName);
    boolean result = fileSystem.mkdirs(path);
    checkResult(result);
    DateTime dateTime = DateTime.now();
    HoodieCommitMetadata commitMetadata =
        createPartitions(numberOfPartitions, true, dateTime, commitTime);

    createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
    createCommitFile(commitMetadata, commitTime);
  }

  static void createMORDataset(String commitTime, String deltaCommitTime, int numberOfPartitions)
      throws IOException, InitializationError, URISyntaxException, InterruptedException {
    Path path = new Path(hiveSyncConfig.basePath);
    FileUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
    HoodieTableMetaClient.initTableType(fileSystem, hiveSyncConfig.basePath,
        HoodieTableType.MERGE_ON_READ, hiveSyncConfig.tableName);
    boolean result = fileSystem.mkdirs(path);
    checkResult(result);
    DateTime dateTime = DateTime.now();
    HoodieCommitMetadata commitMetadata =
        createPartitions(numberOfPartitions, true, dateTime, commitTime);

    createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
    createdTablesSet.add(hiveSyncConfig.databaseName + "."
        + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE);

    // Mirror each write stat as a compaction write stat and record a compaction commit.
    HoodieCompactionMetadata compactionMetadata = new HoodieCompactionMetadata();
    commitMetadata.getPartitionToWriteStats()
        .forEach((key, value) -> value.stream().map(k -> new CompactionWriteStat(k, key, 0, 0, 0))
            .forEach(l -> compactionMetadata.addWriteStat(key, l)));
    createCompactionCommitFile(compactionMetadata, commitTime);
    // Write a delta commit
    HoodieCommitMetadata deltaMetadata =
        createLogFiles(commitMetadata.getPartitionToWriteStats(), true);
    createDeltaCommitFile(deltaMetadata, deltaCommitTime);
  }

  static void addCOWPartitions(int numberOfPartitions, boolean isParquetSchemaSimple,
      DateTime startFrom, String commitTime)
      throws IOException, URISyntaxException, InterruptedException {
    HoodieCommitMetadata commitMetadata =
        createPartitions(numberOfPartitions, isParquetSchemaSimple, startFrom, commitTime);
    createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
    createCommitFile(commitMetadata, commitTime);
  }

  static void addMORPartitions(int numberOfPartitions, boolean isParquetSchemaSimple,
      boolean isLogSchemaSimple, DateTime startFrom, String commitTime, String deltaCommitTime)
      throws IOException, URISyntaxException, InterruptedException {
    HoodieCommitMetadata commitMetadata =
        createPartitions(numberOfPartitions, isParquetSchemaSimple, startFrom, commitTime);
    createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
    createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName
        + HiveSyncTool.SUFFIX_REALTIME_TABLE);
    HoodieCompactionMetadata compactionMetadata = new HoodieCompactionMetadata();
    commitMetadata.getPartitionToWriteStats()
        .forEach((key, value) -> value.stream().map(k -> new CompactionWriteStat(k, key, 0, 0, 0))
            .forEach(l -> compactionMetadata.addWriteStat(key, l)));
    createCompactionCommitFile(compactionMetadata, commitTime);
    HoodieCommitMetadata deltaMetadata =
        createLogFiles(commitMetadata.getPartitionToWriteStats(), isLogSchemaSimple);
    createDeltaCommitFile(deltaMetadata, deltaCommitTime);
  }

  private static HoodieCommitMetadata createLogFiles(
      HashMap<String, List<HoodieWriteStat>> partitionWriteStats, boolean isLogSchemaSimple)
      throws InterruptedException, IOException, URISyntaxException {
    // For every base parquet file, write one Avro log file and record a delta write stat.
    HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
    for (Entry<String, List<HoodieWriteStat>> wEntry : partitionWriteStats.entrySet()) {
      String partitionPath = wEntry.getKey();
      for (HoodieWriteStat wStat : wEntry.getValue()) {
        Path path = new Path(wStat.getPath());
        HoodieDataFile dataFile = new HoodieDataFile(fileSystem.getFileStatus(path));
        HoodieLogFile logFile = generateLogData(path, isLogSchemaSimple);
        HoodieDeltaWriteStat writeStat = new HoodieDeltaWriteStat();
        writeStat.setFileId(dataFile.getFileId());
        writeStat.setPath(logFile.getPath().toString());
        commitMetadata.addWriteStat(partitionPath, writeStat);
      }
    }
    return commitMetadata;
  }

  private static HoodieCommitMetadata createPartitions(int numberOfPartitions,
      boolean isParquetSchemaSimple, DateTime startFrom, String commitTime)
      throws IOException, URISyntaxException, InterruptedException {
    startFrom = startFrom.withTimeAtStartOfDay();

    HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
    for (int i = 0; i < numberOfPartitions; i++) {
      // One yyyy/MM/dd partition per day, walking backwards from startFrom.
      String partitionPath = dtfOut.print(startFrom);
      Path partPath = new Path(hiveSyncConfig.basePath + "/" + partitionPath);
      // makeQualified returns a new Path; keep the qualified one before creating it.
      partPath = fileSystem.makeQualified(partPath);
      fileSystem.mkdirs(partPath);
      List<HoodieWriteStat> writeStats =
          createTestData(partPath, isParquetSchemaSimple, commitTime);
      startFrom = startFrom.minusDays(1);
      writeStats.forEach(s -> commitMetadata.addWriteStat(partitionPath, s));
    }
    return commitMetadata;
  }

  private static List<HoodieWriteStat> createTestData(Path partPath,
      boolean isParquetSchemaSimple, String commitTime)
      throws IOException, URISyntaxException, InterruptedException {
    List<HoodieWriteStat> writeStats = Lists.newArrayList();
    for (int i = 0; i < 5; i++) {
      // Create 5 parquet files per partition, each with a fresh file id.
      String fileId = UUID.randomUUID().toString();
      Path filePath = new Path(partPath.toString() + "/"
          + FSUtils.makeDataFileName(commitTime, DEFAULT_TASK_PARTITIONID, fileId));
      generateParquetData(filePath, isParquetSchemaSimple);
      HoodieWriteStat writeStat = new HoodieWriteStat();
      writeStat.setFileId(fileId);
      writeStat.setPath(filePath.toString());
      writeStats.add(writeStat);
    }
    return writeStats;
  }

  @SuppressWarnings({"unchecked", "deprecation"})
  private static void generateParquetData(Path filePath, boolean isParquetSchemaSimple)
      throws IOException, URISyntaxException, InterruptedException {
    Schema schema = (isParquetSchemaSimple ? SchemaTestUtil.getSimpleSchema()
        : SchemaTestUtil.getEvolvedSchema());
    org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema);
    BloomFilter filter = new BloomFilter(1000, 0.0001);
    HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, filter);
    ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP,
        120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE,
        ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
        ParquetWriter.DEFAULT_WRITER_VERSION, fileSystem.getConf());

    List<IndexedRecord> testRecords = (isParquetSchemaSimple
        ? SchemaTestUtil.generateTestRecords(0, 100)
        : SchemaTestUtil.generateEvolvedTestRecords(100, 100));
    testRecords.forEach(s -> {
      try {
        writer.write(s);
      } catch (IOException e) {
        fail("IOException while writing test records as parquet: " + e.toString());
      }
    });
    writer.close();
  }

  private static HoodieLogFile generateLogData(Path parquetFilePath, boolean isLogSchemaSimple)
      throws IOException, InterruptedException, URISyntaxException {
    Schema schema = (isLogSchemaSimple ? SchemaTestUtil.getSimpleSchema()
        : SchemaTestUtil.getEvolvedSchema());
    HoodieDataFile dataFile = new HoodieDataFile(fileSystem.getFileStatus(parquetFilePath));
    // Write a log file for this parquet file
    Writer logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(parquetFilePath.getParent())
        .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(dataFile.getFileId())
        .overBaseCommit(dataFile.getCommitTime()).withFs(fileSystem).build();
    List<IndexedRecord> records = (isLogSchemaSimple ?
        SchemaTestUtil.generateTestRecords(0, 100)
        : SchemaTestUtil.generateEvolvedTestRecords(100, 100));
    HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, schema);
    logWriter.appendBlock(dataBlock);
    logWriter.close();
    return logWriter.getLogFile();
  }

  private static void checkResult(boolean result) throws InitializationError {
    if (!result) {
      throw new InitializationError("Could not initialize");
    }
  }

  private static void createCommitFile(HoodieCommitMetadata commitMetadata, String commitTime)
      throws IOException {
    byte[] bytes = commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8);
    Path fullPath = new Path(hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME
        + "/" + HoodieTimeline.makeCommitFileName(commitTime));
    FSDataOutputStream fsout = fileSystem.create(fullPath, true);
    fsout.write(bytes);
    fsout.close();
  }

  private static void createCompactionCommitFile(HoodieCompactionMetadata commitMetadata,
      String commitTime) throws IOException {
    byte[] bytes = commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8);
    Path fullPath = new Path(hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME
        + "/" + HoodieTimeline.makeCompactionFileName(commitTime));
    FSDataOutputStream fsout = fileSystem.create(fullPath, true);
    fsout.write(bytes);
    fsout.close();
  }

  private static void createDeltaCommitFile(HoodieCommitMetadata deltaCommitMetadata,
      String deltaCommitTime) throws IOException {
    byte[] bytes = deltaCommitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8);
    Path fullPath = new Path(hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME
        + "/" + HoodieTimeline.makeDeltaFileName(deltaCommitTime));
    FSDataOutputStream fsout = fileSystem.create(fullPath, true);
    fsout.write(bytes);
    fsout.close();
  }
}
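For orientation, here is a minimal sketch of how a JUnit 4 test in the same package might drive this harness. TestUtil.setUp(), createCOWDataset(...), clear(), and the package-private hiveSyncConfig, fileSystem, and getHiveConf() members are taken from the class above; the HiveSyncTool constructor and its syncHoodieTable() entry point are assumptions about the surrounding module, not confirmed by this file.

package com.uber.hoodie.hive;

import org.junit.After;
import org.junit.Before;
import org.junit.Test;

public class ExampleHiveSyncTest {

  @Before
  public void setUp() throws Exception {
    // Starts (or reuses) the shared MiniDFS, ZooKeeper, and HiveServer2 services
    // and resets the target database via TestUtil.clear().
    TestUtil.setUp();
  }

  @Test
  public void syncsACopyOnWriteDataset() throws Exception {
    // Lay down a COW dataset: commit time "100", five daily partitions.
    TestUtil.createCOWDataset("100", 5);

    // Assumed entry point; adjust to the actual HiveSyncTool API in your tree.
    new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem)
        .syncHoodieTable();
  }

  @After
  public void tearDown() throws Exception {
    // Drop the tables and database the test created; TestUtil.shutdown() would
    // additionally stop the shared services.
    TestUtil.clear();
  }
}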