com.uber.hoodie.hive.util.TestUtil.java Source code

Introduction

Here is the source code for com.uber.hoodie.hive.util.TestUtil.java
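
TestUtil stands up single-node HDFS, ZooKeeper and HiveServer2 services so the Hive sync tests can run against embedded infrastructure, and it generates partitioned parquet test datasets. As a quick orientation before the listing, here is a minimal sketch of how a JUnit test might drive it; the test class, table name, path and schema resource below are illustrative assumptions and are not part of this source.

import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;

import com.uber.hoodie.hive.model.HoodieDatasetReference;
import com.uber.hoodie.hive.util.TestUtil;

// Illustrative sketch only: the class name, table name, path and schema resource are made up.
public class ExampleHiveSyncTest {

    @BeforeClass
    public static void startServices() throws Exception {
        TestUtil.setUp();        // starts HDFS, ZooKeeper and HiveServer2, recreates the test database
    }

    @AfterClass
    public static void stopServices() {
        TestUtil.shutdown();     // stops the embedded services
    }

    @Test
    public void createsPartitionedDataset() throws Exception {
        // three daily partitions of parquet files generated from a schema/CSV pair on the test classpath
        HoodieDatasetReference dataset =
                TestUtil.createDataset("example_table", "/tmp/example_table", 3, "/example.schema");
        // ... exercise the Hive sync code under test against "dataset" ...
    }
}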

Source

/*
 * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *           http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.hive.util;

import com.google.common.collect.Sets;
import com.uber.hoodie.common.minicluster.HdfsTestService;
import com.uber.hoodie.common.minicluster.ZookeeperTestService;
import com.uber.hoodie.hive.HoodieHiveConfiguration;
import com.uber.hoodie.hive.client.HoodieHiveClient;
import com.uber.hoodie.hive.model.HoodieDatasetReference;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hive.service.server.HiveServer2;
import org.apache.zookeeper.server.ZooKeeperServer;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.junit.runners.model.InitializationError;
import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.Set;
import java.util.regex.Pattern;

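/**
 * Test harness for the Hoodie Hive sync tests: manages embedded HDFS, ZooKeeper and
 * HiveServer2 services and generates partitioned parquet datasets from schema/CSV
 * fixtures on the test classpath.
 */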
public class TestUtil {
    private static MiniDFSCluster dfsCluster;
    private static ZooKeeperServer zkServer;
    private static HiveServer2 hiveServer;
    public static Configuration configuration;
    public static HoodieHiveConfiguration hDroneConfiguration;
    private static DateTimeFormatter dtfOut;
    public static final String CSV_DELIMITER = "|";
    private static FileSystem fileSystem;
    private static Set<String> createdTablesSet = Sets.newHashSet();

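    /**
     * Lazily starts the HDFS, ZooKeeper and HiveServer2 mini-services, builds the test
     * HoodieHiveConfiguration, drops any tables created by earlier runs and recreates
     * the test database.
     */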
    public static void setUp() throws IOException, InterruptedException {
        if (dfsCluster == null) {
            HdfsTestService service = new HdfsTestService();
            dfsCluster = service.start(true);
            configuration = service.getHadoopConf();
        }
        if (zkServer == null) {
            ZookeeperTestService zkService = new ZookeeperTestService(configuration);
            zkServer = zkService.start();
        }
        if (hiveServer == null) {
            HiveTestService hiveService = new HiveTestService(configuration);
            hiveServer = hiveService.start();
        }
        hDroneConfiguration = HoodieHiveConfiguration.newBuilder().hiveJdbcUrl("jdbc:hive2://127.0.0.1:9999/")
                .hivedb("hdrone_test").jdbcUsername("").jdbcPassword("")
                .hadoopConfiguration(hiveServer.getHiveConf()).build();
        dtfOut = DateTimeFormat.forPattern("yyyy/MM/dd");

        HoodieHiveClient client = new HoodieHiveClient(hDroneConfiguration);
        for (String tableName : createdTablesSet) {
            client.updateHiveSQL("drop table if exists " + tableName);
        }
        createdTablesSet.clear();
        client.updateHiveSQL("drop database if exists " + hDroneConfiguration.getDbName());
        client.updateHiveSQL("create database " + hDroneConfiguration.getDbName());

        fileSystem = FileSystem.get(configuration);
    }

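    /**
     * Stops the HiveServer2, HDFS and ZooKeeper services if they were started.
     */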
    public static void shutdown() {
        if (hiveServer != null) {
            hiveServer.stop();
        }
        if (dfsCluster != null) {
            dfsCluster.shutdown();
        }
        if (zkServer != null) {
            zkServer.shutdown();
        }
    }

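    /**
     * Creates an empty dataset directory at hdfsPath, populates it with daily partitions
     * starting from today and going backwards one day per partition, registers the table
     * name for cleanup, and returns a reference to the new dataset.
     */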
    public static HoodieDatasetReference createDataset(String tableName, String hdfsPath, int numberOfPartitions,
            String schemaFile) throws IOException, InitializationError {
        Path path = new Path(hdfsPath);
        FileUtils.deleteDirectory(new File(hdfsPath));

        boolean result = fileSystem.mkdirs(path);
        checkResult(result);
        HoodieDatasetReference metadata = new HoodieDatasetReference(tableName, path.toString(),
                hDroneConfiguration.getDbName());
        DateTime dateTime = DateTime.now();
        createPartitions(metadata, numberOfPartitions, schemaFile, dateTime, 1);
        createdTablesSet.add(metadata.getDatabaseTableName());
        return metadata;
    }

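    /**
     * Creates numberOfPartitions daily partition directories (yyyy/MM/dd), walking backwards
     * one day at a time from startFrom, and writes test parquet files into each of them.
     */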
    private static void createPartitions(HoodieDatasetReference metadata, int numberOfPartitions, String schemaFile,
            DateTime startFrom, int schemaVersion) throws IOException {
        startFrom = startFrom.withTimeAtStartOfDay();

        for (int i = 0; i < numberOfPartitions; i++) {
            Path partPath = new Path(metadata.getBaseDatasetPath() + "/" + dtfOut.print(startFrom));
            fileSystem.makeQualified(partPath);
            fileSystem.mkdirs(partPath);
            createTestData(partPath, schemaFile, schemaVersion);
            startFrom = startFrom.minusDays(1);
        }
    }

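    /**
     * Writes five parquet files into the given partition directory for the given schema version.
     */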
    private static void createTestData(Path partPath, String schemaFile, int schemaVersion) throws IOException {
        for (int i = 0; i < 5; i++) {
            // Create 5 files
            Path filePath = new Path(partPath.toString() + "/" + getParquetFilePath(schemaVersion, i));
            generateParquetData(filePath, schemaFile);
        }
    }

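    /**
     * Builds a parquet file name that encodes the schema version and the file index.
     */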
    private static String getParquetFilePath(int version, int iteration) {
        return "test.topic.name@sjc1@SV_" + version + "@" + iteration + ".parquet";
    }

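    /**
     * Parses a parquet MessageType from a schema file on the test classpath.
     */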
    public static MessageType readSchema(String schemaFile) throws IOException {
        return MessageTypeParser.parseMessageType(IOUtils.toString(TestUtil.class.getResourceAsStream(schemaFile)));
    }

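    /**
     * Writes the rows of the matching CSV resource through CsvParquetWriter using the given
     * schema, then copies the locally written file onto the test HDFS at the same path.
     */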
    public static void generateParquetData(Path filePath, String schemaFile) throws IOException {
        MessageType schema = readSchema(schemaFile);
        CsvParquetWriter writer = new CsvParquetWriter(filePath, schema);

        BufferedReader br = new BufferedReader(
                new InputStreamReader(TestUtil.class.getResourceAsStream(getDataFile(schemaFile))));
        String line;
        try {
            while ((line = br.readLine()) != null) {
                String[] fields = line.split(Pattern.quote(CSV_DELIMITER));
                writer.write(Arrays.asList(fields));
            }
            writer.close();
        } finally {
            br.close();
        }

        InputStreamReader io = null;
        FSDataOutputStream hdfsPath = null;
        try {
            io = new FileReader(filePath.toString());
            hdfsPath = fileSystem.create(filePath);
            IOUtils.copy(io, hdfsPath);
        } finally {
            if (io != null) {
                io.close();
            }
            if (hdfsPath != null) {
                hdfsPath.close();
            }
        }
    }

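    /**
     * Maps a ".schema" resource name to its matching ".csv" data file.
     */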
    private static String getDataFile(String schemaFile) {
        return schemaFile.replaceAll(".schema", ".csv");
    }

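    /**
     * Fails test initialization if a filesystem operation did not succeed.
     */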
    private static void checkResult(boolean result) throws InitializationError {
        if (!result) {
            throw new InitializationError("Could not initialize");
        }
    }

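    /**
     * Adds newPartitionCount partitions written with the new schema file, positioned after
     * the given start time, to simulate schema evolution on an existing dataset.
     */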
    public static void evolveDataset(HoodieDatasetReference metadata, int newPartitionCount, String newSchema,
            Long startFrom, int schemaVersion) throws IOException {
        createPartitions(metadata, newPartitionCount, newSchema,
                new DateTime(startFrom).plusDays(newPartitionCount + 1), schemaVersion);
    }
}