com.uber.hoodie.common.HoodieClientTestUtils.java Source code

Introduction

Here is the source code for com.uber.hoodie.common.HoodieClientTestUtils.java, a collection of utility methods used by tests in the HoodieClient module. A short usage sketch follows the listing.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.common;

import com.uber.hoodie.HoodieReadClient;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.TableFileSystemView;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils;
import com.uber.hoodie.config.HoodieStorageConfig;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.io.storage.HoodieParquetConfig;
import com.uber.hoodie.io.storage.HoodieParquetWriter;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.UUID;
import java.util.stream.Collectors;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

/**
 * Utility methods to aid testing inside the HoodieClient module.
 */
public class HoodieClientTestUtils {

    private static final transient Logger log = LogManager.getLogger(HoodieClientTestUtils.class);

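    /**
     * Flattens an iterator of per-partition WriteStatus lists into a single list.
     */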
    public static List<WriteStatus> collectStatuses(Iterator<List<WriteStatus>> statusListItr) {
        List<WriteStatus> statuses = new ArrayList<>();
        while (statusListItr.hasNext()) {
            statuses.addAll(statusListItr.next());
        }
        return statuses;
    }

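    /**
     * Collects the record keys of the given HoodieRecords into a set.
     */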
    public static Set<String> getRecordKeys(List<HoodieRecord> hoodieRecords) {
        Set<String> keys = new HashSet<>();
        for (HoodieRecord rec : hoodieRecords) {
            keys.add(rec.getRecordKey());
        }
        return keys;
    }

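    /**
     * Creates an empty metadata file with the given suffix under the dataset's meta folder.
     */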
    private static void fakeMetaFile(String basePath, String commitTime, String suffix) throws IOException {
        String parentPath = basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME;
        new File(parentPath).mkdirs();
        new File(parentPath + "/" + commitTime + suffix).createNewFile();
    }

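    /**
     * Creates an empty (fake) completed commit file for the given commit time.
     */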
    public static void fakeCommitFile(String basePath, String commitTime) throws IOException {
        fakeMetaFile(basePath, commitTime, HoodieTimeline.COMMIT_EXTENSION);
    }

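    /**
     * Creates an empty (fake) inflight commit file for the given commit time.
     */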
    public static void fakeInFlightFile(String basePath, String commitTime) throws IOException {
        fakeMetaFile(basePath, commitTime, HoodieTimeline.INFLIGHT_EXTENSION);
    }

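    /**
     * Creates an empty (zero-length) fake data file for the given commit and file id.
     */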
    public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId)
            throws Exception {
        fakeDataFile(basePath, partitionPath, commitTime, fileId, 0);
    }

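    /**
     * Creates a fake data file of the given length for the given commit and file id.
     */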
    public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId,
            long length) throws Exception {
        String parentPath = String.format("%s/%s", basePath, partitionPath);
        new File(parentPath).mkdirs();
        String path = String.format("%s/%s", parentPath, FSUtils.makeDataFileName(commitTime, "1-0-1", fileId));
        new File(path).createNewFile();
        try (RandomAccessFile raf = new RandomAccessFile(path, "rw")) {
            raf.setLength(length);
        }
    }

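    /**
     * Builds a local SparkConf (Kryo serializer, local[8]) with Hoodie support enabled, for use in tests.
     */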
    public static SparkConf getSparkConfForTest(String appName) {
        SparkConf sparkConf = new SparkConf().setAppName(appName)
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer").setMaster("local[8]");
        return HoodieReadClient.addHoodieSupport(sparkConf);
    }

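    /**
     * Resolves, for every file id touched by the given commits, the full path recorded in its commit metadata.
     */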
    public static HashMap<String, String> getLatestFileIDsToFullPath(String basePath, HoodieTimeline commitTimeline,
            List<HoodieInstant> commitsToReturn) throws IOException {
        HashMap<String, String> fileIdToFullPath = new HashMap<>();
        for (HoodieInstant commit : commitsToReturn) {
            HoodieCommitMetadata metadata = HoodieCommitMetadata
                    .fromBytes(commitTimeline.getInstantDetails(commit).get(), HoodieCommitMetadata.class);
            fileIdToFullPath.putAll(metadata.getFileIdAndFullPaths(basePath));
        }
        return fileIdToFullPath;
    }

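    /**
     * Reads the records written by a single commit back as a DataFrame.
     */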
    public static Dataset<Row> readCommit(String basePath, SQLContext sqlContext, HoodieTimeline commitTimeline,
            String commitTime) {
        HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
        if (!commitTimeline.containsInstant(commitInstant)) {
            throw new HoodieException("No commit exists at " + commitTime);
        }
        try {
            HashMap<String, String> paths = getLatestFileIDsToFullPath(basePath, commitTimeline,
                    Arrays.asList(commitInstant));
            log.info("Path :" + paths.values());
            return sqlContext.read().parquet(paths.values().toArray(new String[paths.size()]))
                    .filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime));
        } catch (Exception e) {
            throw new HoodieException("Error reading commit " + commitTime, e);
        }
    }

    /**
     * Obtain all new data written into the Hoodie dataset since the given timestamp.
     */
    public static Dataset<Row> readSince(String basePath, SQLContext sqlContext, HoodieTimeline commitTimeline,
            String lastCommitTime) {
        List<HoodieInstant> commitsToReturn = commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE)
                .getInstants().collect(Collectors.toList());
        try {
            // Go over the commit metadata, and obtain the new files that need to be read.
            HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath, commitTimeline,
                    commitsToReturn);
            return sqlContext.read().parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]))
                    .filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
        } catch (IOException e) {
            throw new HoodieException("Error pulling data incrementally from commitTimestamp :" + lastCommitTime,
                    e);
        }
    }

    /**
     * Reads the given paths under a Hoodie dataset out as a DataFrame.
     */
    public static Dataset<Row> read(JavaSparkContext jsc, String basePath, SQLContext sqlContext, FileSystem fs,
            String... paths) {
        List<String> filteredPaths = new ArrayList<>();
        try {
            HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath, true);
            for (String path : paths) {
                TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(metaClient,
                        metaClient.getCommitsTimeline().filterCompletedInstants(), fs.globStatus(new Path(path)));
                List<HoodieDataFile> latestFiles = fileSystemView.getLatestDataFiles().collect(Collectors.toList());
                for (HoodieDataFile file : latestFiles) {
                    filteredPaths.add(file.getPath());
                }
            }
            return sqlContext.read().parquet(filteredPaths.toArray(new String[filteredPaths.size()]));
        } catch (Exception e) {
            throw new HoodieException("Error reading hoodie dataset as a dataframe", e);
        }
    }

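    /**
     * Writes the given records into a new parquet file under basePath/partitionPath, optionally creating
     * the corresponding commit files.
     */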
    public static String writeParquetFile(String basePath, String partitionPath, String filename,
            List<HoodieRecord> records, Schema schema, BloomFilter filter, boolean createCommitTime)
            throws IOException {

        if (filter == null) {
            filter = new BloomFilter(10000, 0.0000001);
        }
        HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema),
                schema, filter);
        String commitTime = FSUtils.getCommitTime(filename);
        HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
                ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
                HoodieTestUtils.getDefaultHadoopConf(),
                Double.valueOf(HoodieStorageConfig.DEFAULT_STREAM_COMPRESSION_RATIO));
        HoodieParquetWriter writer = new HoodieParquetWriter(commitTime,
                new Path(basePath + "/" + partitionPath + "/" + filename), config, schema);
        int seqId = 1;
        for (HoodieRecord record : records) {
            GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
            HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, commitTime, "" + seqId++);
            HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(),
                    filename);
            writer.writeAvro(record.getRecordKey(), avroRecord);
            filter.add(record.getRecordKey());
        }
        writer.close();

        if (createCommitTime) {
            HoodieTestUtils.createMetadataFolder(basePath);
            HoodieTestUtils.createCommitFiles(basePath, commitTime);
        }
        return filename;
    }

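    /**
     * Generates a fresh commit time and file id, then writes the given records into a new parquet file.
     */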
    public static String writeParquetFile(String basePath, String partitionPath, List<HoodieRecord> records,
            Schema schema, BloomFilter filter, boolean createCommitTime) throws IOException, InterruptedException {
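        // Sleep so the timestamp-based commit time generated below is strictly newer than any earlier one.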
        Thread.sleep(1000);
        String commitTime = HoodieTestUtils.makeNewCommitTime();
        String fileId = UUID.randomUUID().toString();
        String filename = FSUtils.makeDataFileName(commitTime, "1-0-1", fileId);
        HoodieTestUtils.createCommitFiles(basePath, commitTime);
        return HoodieClientTestUtils.writeParquetFile(basePath, partitionPath, filename, records, schema, filter,
                createCommitTime);
    }
}
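
Example

Below is a minimal usage sketch, not part of the original file, showing how a test might drive these helpers. The base path, partition path, and commit time are placeholder values chosen for illustration.

import java.util.UUID;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

import com.uber.hoodie.common.HoodieClientTestUtils;

public class HoodieClientTestUtilsExample {

    public static void main(String[] args) throws Exception {
        // Local Spark context configured the way the utilities expect (Kryo, local[8], Hoodie support).
        SparkConf conf = HoodieClientTestUtils.getSparkConfForTest("hoodie-client-test-example");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        try {
            String basePath = "/tmp/hoodie-test";     // placeholder scratch directory
            String commitTime = "20190101000000";     // placeholder commit time (yyyyMMddHHmmss)

            // Fake the timeline artifacts for one completed commit ...
            HoodieClientTestUtils.fakeCommitFile(basePath, commitTime);
            // ... and an empty data file belonging to that commit.
            HoodieClientTestUtils.fakeDataFile(basePath, "2019/01/01", commitTime, UUID.randomUUID().toString());
        } finally {
            jsc.stop();
        }
    }
}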