com.uber.hoodie.common.table.HoodieTableMetaClient.java Source code


Introduction

Here is the source code for com.uber.hoodie.common.table.HoodieTableMetaClient.java, the class that exposes the meta-data of a hoodie table (commits, savepoints, compactions, cleanups) as a HoodieTimeline, given a Hadoop configuration and the table's base path.
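
A minimal usage sketch before the full listing (the base path below is hypothetical and must already point at an initialized hoodie dataset; filterCompletedInstants() and countInstants() are assumed HoodieTimeline helpers):

import org.apache.hadoop.conf.Configuration;

import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;

public class MetaClientExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical base path; the constructor throws DatasetNotFoundException
        // if the .hoodie meta folder is missing underneath it
        String basePath = "/tmp/hoodie/sample-table";
        HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), basePath);

        System.out.println("Table type: " + metaClient.getTableType());
        // Timelines are computed lazily and cached; use reloadActiveTimeline() to refresh
        HoodieTimeline commits = metaClient.getCommitsTimeline().filterCompletedInstants();
        System.out.println("Completed commits: " + commits.countInstants());
    }
}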

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.common.table;

import static com.uber.hoodie.common.model.HoodieTableType.MERGE_ON_READ;

import com.google.common.base.Preconditions;
import com.uber.hoodie.common.SerializableConfiguration;
import com.uber.hoodie.common.io.storage.HoodieWrapperFileSystem;
import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieArchivedTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.ConsistencyGuardConfig;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.FailSafeConsistencyGuard;
import com.uber.hoodie.common.util.NoOpConsistencyGuard;
import com.uber.hoodie.exception.DatasetNotFoundException;
import com.uber.hoodie.exception.HoodieException;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.Objects;
import java.util.Properties;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

/**
 * <code>HoodieTableMetaClient</code> provides access to meta-data about a hoodie table. It returns
 * meta-data about commits, savepoints, compactions and cleanups as a <code>HoodieTimeline</code>.
 * Create an instance of <code>HoodieTableMetaClient</code> with a FileSystem and basePath to
 * start getting the meta-data. <p> All the timelines are computed lazily; once computed, a
 * timeline is cached and never refreshed. Use <code>HoodieTimeline.reload()</code> to refresh
 * timelines.
 *
 * @see HoodieTimeline
 * @since 0.3.0
 */
public class HoodieTableMetaClient implements Serializable {

    private static final transient Logger log = LogManager.getLogger(HoodieTableMetaClient.class);
    public static String METAFOLDER_NAME = ".hoodie";
    public static String TEMPFOLDER_NAME = METAFOLDER_NAME + File.separator + ".temp";
    public static String AUXILIARYFOLDER_NAME = METAFOLDER_NAME + File.separator + ".aux";
    public static final String MARKER_EXTN = ".marker";

    private String basePath;
    private transient HoodieWrapperFileSystem fs;
    private String metaPath;
    private SerializableConfiguration hadoopConf;
    private HoodieTableType tableType;
    private HoodieTableConfig tableConfig;
    private HoodieActiveTimeline activeTimeline;
    private HoodieArchivedTimeline archivedTimeline;
    private ConsistencyGuardConfig consistencyGuardConfig = ConsistencyGuardConfig.newBuilder().build();

    public HoodieTableMetaClient(Configuration conf, String basePath) throws DatasetNotFoundException {
        // Do not load any timeline by default
        this(conf, basePath, false);
    }

    public HoodieTableMetaClient(Configuration conf, String basePath, boolean loadActiveTimelineOnLoad) {
        this(conf, basePath, loadActiveTimelineOnLoad, ConsistencyGuardConfig.newBuilder().build());
    }

    public HoodieTableMetaClient(Configuration conf, String basePath, boolean loadActiveTimelineOnLoad,
            ConsistencyGuardConfig consistencyGuardConfig) throws DatasetNotFoundException {
        log.info("Loading HoodieTableMetaClient from " + basePath);
        this.basePath = basePath;
        this.consistencyGuardConfig = consistencyGuardConfig;
        this.hadoopConf = new SerializableConfiguration(conf);
        Path basePathDir = new Path(this.basePath);
        this.metaPath = new Path(basePath, METAFOLDER_NAME).toString();
        Path metaPathDir = new Path(this.metaPath);
        this.fs = getFs();
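        // Throws DatasetNotFoundException if basePath or the .hoodie meta folder is missing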
        DatasetNotFoundException.checkValidDataset(fs, basePathDir, metaPathDir);
        this.tableConfig = new HoodieTableConfig(fs, metaPath);
        this.tableType = tableConfig.getTableType();
        log.info("Finished Loading Table of type " + tableType + " from " + basePath);
        if (loadActiveTimelineOnLoad) {
            log.info("Loading Active commit timeline for " + basePath);
            getActiveTimeline();
        }
    }

    /**
     * For serializing and de-serializing.
     *
     * @deprecated
     */
    public HoodieTableMetaClient() {
    }

    /**
     * This method is only used when this object is deserialized in a Spark executor.
     *
     * @deprecated
     */
    private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException {
        in.defaultReadObject();
        fs = null; // will be lazily initialized
    }

    private void writeObject(java.io.ObjectOutputStream out) throws IOException {
        out.defaultWriteObject();
    }

    /**
     * @return Base path
     */
    public String getBasePath() {
        return basePath;
    }

    /**
     * @return Hoodie Table Type
     */
    public HoodieTableType getTableType() {
        return tableType;
    }

    /**
     * @return Meta path
     */
    public String getMetaPath() {
        return metaPath;
    }

    /**
     * @return Temp Folder path
     */
    public String getTempFolderPath() {
        return basePath + File.separator + TEMPFOLDER_NAME;
    }

    /**
     * Returns the marker folder path for the given instant.
     *
     * @param instantTs Instant Timestamp
     * @return Marker folder path
     */
    public String getMarkerFolderPath(String instantTs) {
        return String.format("%s%s%s", getTempFolderPath(), File.separator, instantTs);
    }

    /**
     * @return Auxiliary Meta path
     */
    public String getMetaAuxiliaryPath() {
        return basePath + File.separator + AUXILIARYFOLDER_NAME;
    }

    /**
     * @return path where archived timeline is stored
     */
    public String getArchivePath() {
        String archiveFolder = tableConfig.getArchivelogFolder();
        if (archiveFolder.equals(HoodieTableConfig.DEFAULT_ARCHIVELOG_FOLDER)) {
            return getMetaPath();
        } else {
            return getMetaPath() + "/" + archiveFolder;
        }
    }

    /**
     * @return Table Config
     */
    public HoodieTableConfig getTableConfig() {
        return tableConfig;
    }

    /**
     * Get the FS implementation for this table
     */
    public HoodieWrapperFileSystem getFs() {
        if (fs == null) {
            FileSystem fileSystem = FSUtils.getFs(metaPath, hadoopConf.get());
            Preconditions.checkArgument(!(fileSystem instanceof HoodieWrapperFileSystem),
                    "File System not expected to be that of HoodieWrapperFileSystem");
            fs = new HoodieWrapperFileSystem(fileSystem,
                    consistencyGuardConfig.isConsistencyCheckEnabled()
                            ? new FailSafeConsistencyGuard(fileSystem, consistencyGuardConfig)
                            : new NoOpConsistencyGuard());
        }
        return fs;
    }

    /**
     * Return the raw file-system, unwrapped from {@link HoodieWrapperFileSystem}.
     *
     * @return Raw {@link FileSystem}
     */
    public FileSystem getRawFs() {
        return getFs().getFileSystem();
    }

    public Configuration getHadoopConf() {
        return hadoopConf.get();
    }

    /**
     * Get the active instants as a timeline
     *
     * @return Active instants timeline
     */
    public synchronized HoodieActiveTimeline getActiveTimeline() {
        if (activeTimeline == null) {
            activeTimeline = new HoodieActiveTimeline(this);
        }
        return activeTimeline;
    }

    /**
     * Reload ActiveTimeline and cache
     *
     * @return Active instants timeline
     */
    public synchronized HoodieActiveTimeline reloadActiveTimeline() {
        activeTimeline = new HoodieActiveTimeline(this);
        return activeTimeline;
    }

    public ConsistencyGuardConfig getConsistencyGuardConfig() {
        return consistencyGuardConfig;
    }

    /**
     * Get the archived commits as a timeline. This is a costly operation, as all data from the
     * archived files is read. This should not be used, except for historical debugging purposes.
     *
     * @return Archived commit timeline
     */
    public synchronized HoodieArchivedTimeline getArchivedTimeline() {
        if (archivedTimeline == null) {
            archivedTimeline = new HoodieArchivedTimeline(this);
        }
        return archivedTimeline;
    }

    /**
     * Helper method to initialize a dataset, with the given basePath, tableType, tableName and
     * archiveLogFolder.
     */
    public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath, String tableType,
            String tableName, String archiveLogFolder) throws IOException {
        HoodieTableType type = HoodieTableType.valueOf(tableType);
        Properties properties = new Properties();
        properties.put(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, tableName);
        properties.put(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, type.name());
        properties.put(HoodieTableConfig.HOODIE_ARCHIVELOG_FOLDER_PROP_NAME, archiveLogFolder);
        return HoodieTableMetaClient.initializePathAsHoodieDataset(hadoopConf, basePath, properties);
    }

    /**
     * Helper method to initialize a given path, with a given table type, table name and payload
     * class (the payload class is only used for MERGE_ON_READ tables).
     */
    public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath,
            HoodieTableType tableType, String tableName, String payloadClassName) throws IOException {
        Properties properties = new Properties();
        properties.setProperty(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, tableName);
        properties.setProperty(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, tableType.name());
        if (tableType == MERGE_ON_READ) {
            properties.setProperty(HoodieTableConfig.HOODIE_PAYLOAD_CLASS_PROP_NAME, payloadClassName);
        }
        return HoodieTableMetaClient.initializePathAsHoodieDataset(hadoopConf, basePath, properties);
    }

    /**
     * Helper method to initialize a given path as a hoodie dataset, with configs passed in as
     * Properties.
     *
     * @return Instance of HoodieTableMetaClient
     */
    public static HoodieTableMetaClient initializePathAsHoodieDataset(Configuration hadoopConf, String basePath,
            Properties props) throws IOException {
        log.info("Initializing " + basePath + " as hoodie dataset " + basePath);
        Path basePathDir = new Path(basePath);
        final FileSystem fs = FSUtils.getFs(basePath, hadoopConf);
        if (!fs.exists(basePathDir)) {
            fs.mkdirs(basePathDir);
        }
        Path metaPathDir = new Path(basePath, METAFOLDER_NAME);
        if (!fs.exists(metaPathDir)) {
            fs.mkdirs(metaPathDir);
        }

        // if anything other than default archive log folder is specified, create that too
        String archiveLogPropVal = props.getProperty(HoodieTableConfig.HOODIE_ARCHIVELOG_FOLDER_PROP_NAME,
                HoodieTableConfig.DEFAULT_ARCHIVELOG_FOLDER);
        if (!archiveLogPropVal.equals(HoodieTableConfig.DEFAULT_ARCHIVELOG_FOLDER)) {
            Path archiveLogDir = new Path(metaPathDir, archiveLogPropVal);
            if (!fs.exists(archiveLogDir)) {
                fs.mkdirs(archiveLogDir);
            }
        }

        // Always create temporaryFolder which is needed for finalizeWrite for Hoodie tables
        final Path temporaryFolder = new Path(basePath, HoodieTableMetaClient.TEMPFOLDER_NAME);
        if (!fs.exists(temporaryFolder)) {
            fs.mkdirs(temporaryFolder);
        }

        // Always create auxiliary folder which is needed to track compaction workloads (stats and any metadata in future)
        final Path auxiliaryFolder = new Path(basePath, HoodieTableMetaClient.AUXILIARYFOLDER_NAME);
        if (!fs.exists(auxiliaryFolder)) {
            fs.mkdirs(auxiliaryFolder);
        }

        HoodieTableConfig.createHoodieProperties(fs, metaPathDir, props);
        // We should not use fs.getConf as this might be different from the original configuration
        // used to create the fs in unit tests
        HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, basePath);
        log.info("Finished initializing Table of type " + metaClient.getTableConfig().getTableType() + " from "
                + basePath);
        return metaClient;
    }

    // HELPER METHODS TO SCAN META FILES
    public static FileStatus[] scanFiles(FileSystem fs, Path metaPath, PathFilter nameFilter) throws IOException {
        return fs.listStatus(metaPath, nameFilter);
    }

    /**
     * Get the commit timeline visible for this table
     */
    public HoodieTimeline getCommitsTimeline() {
        switch (this.getTableType()) {
        case COPY_ON_WRITE:
            return getActiveTimeline().getCommitTimeline();
        case MERGE_ON_READ:
            // We need to include the parquet files written out in delta commits
            // Include commit action to be able to start doing a MOR over a COW dataset - no
            // migration required
            return getActiveTimeline().getCommitsTimeline();
        default:
            throw new HoodieException("Unsupported table type :" + this.getTableType());
        }
    }

    /**
     * Get the commit + pending-compaction timeline visible for this table.
     * A RT filesystem view is constructed with this timeline so that file-slice after pending compaction-requested
     * instant-time is also considered valid. A RT file-system view for reading must then merge the file-slices before
     * and after pending compaction instant so that all delta-commits are read.
     */
    public HoodieTimeline getCommitsAndCompactionTimeline() {
        switch (this.getTableType()) {
        case COPY_ON_WRITE:
            return getActiveTimeline().getCommitTimeline();
        case MERGE_ON_READ:
            return getActiveTimeline().getCommitsAndCompactionTimeline();
        default:
            throw new HoodieException("Unsupported table type :" + this.getTableType());
        }
    }

    /**
     * Get the compacted commit timeline visible for this table
     */
    public HoodieTimeline getCommitTimeline() {
        switch (this.getTableType()) {
        case COPY_ON_WRITE:
        case MERGE_ON_READ:
            // We need to include the parquet files written out in delta commits in tagging
            return getActiveTimeline().getCommitTimeline();
        default:
            throw new HoodieException("Unsupported table type :" + this.getTableType());
        }
    }

    /**
     * Gets the commit action type
     */
    public String getCommitActionType() {
        switch (this.getTableType()) {
        case COPY_ON_WRITE:
            return HoodieActiveTimeline.COMMIT_ACTION;
        case MERGE_ON_READ:
            return HoodieActiveTimeline.DELTA_COMMIT_ACTION;
        default:
            throw new HoodieException("Could not commit on unknown storage type " + this.getTableType());
        }
    }

    /**
     * Helper method to scan all hoodie-instant metafiles and construct HoodieInstant objects
     *
     * @param fs                 FileSystem
     * @param metaPath           Meta Path where hoodie instants are present
     * @param includedExtensions Included hoodie extensions
     * @return List of Hoodie Instants generated
     * @throws IOException in case of failure
     */
    public static List<HoodieInstant> scanHoodieInstantsFromFileSystem(FileSystem fs, Path metaPath,
            Set<String> includedExtensions) throws IOException {
        return Arrays.stream(HoodieTableMetaClient.scanFiles(fs, metaPath, path -> {
            // Include only the meta files with extensions that need to be included
            String extension = FSUtils.getFileExtension(path.getName());
            return includedExtensions.contains(extension);
        })).sorted(Comparator.comparing(
                // Sort the meta-data by the instant time (first part of the file name)
                fileStatus -> FSUtils.getInstantTime(fileStatus.getPath().getName())))
                // create HoodieInstants from FileStatus, which extracts properties
                .map(HoodieInstant::new).collect(Collectors.toList());
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        HoodieTableMetaClient that = (HoodieTableMetaClient) o;
        return Objects.equals(basePath, that.basePath) && tableType == that.tableType;
    }

    @Override
    public int hashCode() {
        return Objects.hash(basePath, tableType);
    }

    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder("HoodieTableMetaClient{");
        sb.append("basePath='").append(basePath).append('\'');
        sb.append(", metaPath='").append(metaPath).append('\'');
        sb.append(", tableType=").append(tableType);
        sb.append('}');
        return sb.toString();
    }

    public void setBasePath(String basePath) {
        this.basePath = basePath;
    }

    public void setMetaPath(String metaPath) {
        this.metaPath = metaPath;
    }

    public void setActiveTimeline(HoodieActiveTimeline activeTimeline) {
        this.activeTimeline = activeTimeline;
    }

    public void setTableConfig(HoodieTableConfig tableConfig) {
        this.tableConfig = tableConfig;
    }
}
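
For completeness, a hedged sketch of creating a new hoodie dataset with the initTableType helper above; the base path and table name are hypothetical, and the payload class argument may be null since it is only consulted for MERGE_ON_READ tables:

import org.apache.hadoop.conf.Configuration;

import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.table.HoodieTableMetaClient;

public class InitDatasetExample {
    public static void main(String[] args) throws Exception {
        // Creates the base path, the .hoodie meta folder, the .temp and .aux folders,
        // and writes hoodie.properties before handing back a meta client
        HoodieTableMetaClient metaClient = HoodieTableMetaClient.initTableType(
                new Configuration(), "/tmp/hoodie/new-table",
                HoodieTableType.COPY_ON_WRITE, "new_table", null);
        System.out.println(metaClient);
    }
}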