com.uber.hoodie.common.model.HoodieTableMetadata.java Source code

Java tutorial

Introduction

Here is the source code for com.uber.hoodie.common.model.HoodieTableMetadata.java

Source

/*
 * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *          http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.common.model;

import com.uber.hoodie.common.util.FSUtils;

import com.uber.hoodie.exception.DatasetNotFoundException;
import com.uber.hoodie.exception.HoodieIOException;
import com.uber.hoodie.exception.InvalidDatasetException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.SortedMap;
import java.util.TreeMap;

/**
 * Manages all file system level interactions for the Hoodie tables.
 *
 * <p>On construction the instance reads the commit files, inflight files and the
 * {@code hoodie.properties} file found under the {@code .hoodie} folder of the dataset base path and
 * keeps that snapshot in memory; it is not refreshed afterwards.
 *
 * <p>The {@link FileSystem} handle and the metadata folder path are {@code transient}, so they are
 * {@code null} on an instance restored through Java serialization. Methods that read through those
 * fields ({@link #getCommitMetadata(String)}, {@link #getAllCommitMetadata()}) cannot be used on such
 * a copy; the methods that accept a {@link FileSystem} argument use the argument instead.
 */
public class HoodieTableMetadata implements Serializable {
    public static final String MAX_COMMIT_TS = String.valueOf(Long.MAX_VALUE);
    public static final String HOODIE_TABLE_NAME_PROP_NAME = "hoodie.table.name";
    public static final String HOODIE_TABLE_TYPE_PROP_NAME = "hoodie.table.type";
    public static final HoodieTableType DEFAULT_TABLE_TYPE = HoodieTableType.COPY_ON_WRITE;

    public static final String HOODIE_PROPERTIES_FILE = "hoodie.properties";
    private static final String HOODIE_HDRONE_PROFILE_DEFAULT_VALUE = "HOODIE";
    private static final String HOODIE_HDRONE_PROFILE_PROP_NAME = "hoodie.hdrone.dataset.profile";

    private static final Logger log = LogManager.getLogger(HoodieTableMetadata.class);

    // Not serialized: both are null on a deserialized instance.
    private transient final FileSystem fs;
    private transient final Path metadataFolder;

    private final Properties properties;
    private final HoodieCommits commits;
    private final List<String> inflightCommits;
    private final String basePath;

    public static final String METAFOLDER_NAME = ".hoodie";
    public static final String COMMIT_FILE_SUFFIX = ".commit";
    public static final String INFLIGHT_FILE_SUFFIX = ".inflight";

    /**
     * Constructor which initializes the hoodie table metadata. It will initialize the meta-data if not already present.
     *
     * @param fs        file system holding the dataset
     * @param basePath  base path of the dataset; created if it does not exist
     * @param tableName table name written to {@code hoodie.properties} when the file has to be created
     * @throws HoodieIOException if the file system reports an {@link IOException} while loading
     */
    public HoodieTableMetadata(FileSystem fs, String basePath, String tableName) {
        this(fs, basePath, tableName, true);
    }

    /**
     * Constructor which loads the hoodie table metadata. It requires the meta-data to be present already.
     *
     * @param fs       file system holding the dataset
     * @param basePath base path of the dataset
     * @throws DatasetNotFoundException if the base path does not exist or is not a directory
     * @throws InvalidDatasetException  if {@code hoodie.properties} is missing under the meta folder
     * @throws HoodieIOException        if the file system reports an {@link IOException} while loading
     */
    public HoodieTableMetadata(FileSystem fs, String basePath) {
        this(fs, basePath, null, false);
    }

    /**
     * Shared constructor body.
     *
     * @param initOnMissing when {@code true}, a missing base path and a missing {@code hoodie.properties}
     *                      are created; when {@code false}, either one being missing is an error
     */
    private HoodieTableMetadata(FileSystem fs, String basePath, String tableName, boolean initOnMissing) {
        this.fs = fs;
        this.basePath = basePath;

        try {
            Path basePathDir = new Path(this.basePath);
            if (!fs.exists(basePathDir)) {
                if (!initOnMissing) {
                    throw new DatasetNotFoundException(this.basePath);
                }
                fs.mkdirs(basePathDir);
            }

            // The base path exists at this point but may still be a plain file.
            if (!fs.isDirectory(basePathDir)) {
                throw new DatasetNotFoundException(this.basePath);
            }

            this.metadataFolder = new Path(this.basePath, METAFOLDER_NAME);
            Path propertyPath = new Path(metadataFolder, HOODIE_PROPERTIES_FILE);
            if (!fs.exists(propertyPath)) {
                if (!initOnMissing) {
                    throw new InvalidDatasetException(this.basePath);
                }
                // create .hoodie folder if it does not exist.
                createHoodieProperties(metadataFolder, tableName);
            }

            // Load meta data
            this.commits = new HoodieCommits(scanCommits(COMMIT_FILE_SUFFIX));
            this.inflightCommits = scanCommits(INFLIGHT_FILE_SUFFIX);
            this.properties = readHoodieProperties();
            log.info("All commits :" + commits);
        } catch (IOException e) {
            throw new HoodieIOException("Could not load HoodieMetadata from path " + basePath, e);
        }
    }

    /**
     * Returns all the commit metadata for this table. Reads all the commit files from HDFS.
     * Expensive operation, use with caution.
     *
     * @return unmodifiable SortedMap of CommitTime,HoodieCommitMetadata
     * @throws HoodieIOException if any commit file cannot be read
     */
    public SortedMap<String, HoodieCommitMetadata> getAllCommitMetadata() {
        try {
            TreeMap<String, HoodieCommitMetadata> metadataMap = new TreeMap<>();
            for (String commitTs : commits.getCommitList()) {
                metadataMap.put(commitTs, getCommitMetadata(commitTs));
            }
            return Collections.unmodifiableSortedMap(metadataMap);
        } catch (IOException e) {
            throw new HoodieIOException("Could not load all commits for table " + getTableName(), e);
        }
    }

    /**
     * Reads and parses the commit file of a single commit from the meta folder.
     *
     * @param commitTime commit time identifying the commit file
     * @return the metadata parsed from the commit file's JSON content
     * @throws IOException if the commit file cannot be opened or read
     */
    public HoodieCommitMetadata getCommitMetadata(String commitTime) throws IOException {
        Path commitFile = new Path(metadataFolder, FSUtils.makeCommitFileName(commitTime));
        try (FSDataInputStream is = fs.open(commitFile)) {
            // NOTE(review): IOUtils.toString(InputStream) decodes with the platform default charset;
            // confirm this matches how commit files are written before pinning an explicit charset.
            String jsonStr = IOUtils.toString(is);
            return HoodieCommitMetadata.fromJsonString(jsonStr);
        }
    }

    /**
     * Returns the table type recorded in {@code hoodie.properties}.
     *
     * @return the configured table type, or {@link #DEFAULT_TABLE_TYPE} when the property is absent
     */
    public HoodieTableType getTableType() {
        // Fall back to the default so a properties file without the type key does not cause an NPE.
        return HoodieTableType.valueOf(
                properties.getProperty(HOODIE_TABLE_TYPE_PROP_NAME, DEFAULT_TABLE_TYPE.name()));
    }

    /**
     * Lookup the file name for specified <code>HoodieRecord</code>, using the file id of the
     * record's current location.
     *
     * TODO(vc): This metadata needs to be cached in each executor, statically, and used across, if
     * we need to be nicer to the NameNode
     */
    public String getFilenameForRecord(FileSystem fs, final HoodieRecord record) {
        String fileId = record.getCurrentLocation().getFileId();
        return getFilenameForRecord(fs, record, fileId);
    }

    /**
     * Lookup the name of the latest file version for the given file id inside the record's partition.
     *
     * @param fs     file system used to list the partition (the instance's own handle is transient)
     * @param record record whose partition path is listed
     * @param fileId file id to resolve
     * @return file name of the newest valid version of {@code fileId}
     * @throws HoodieIOException if the partition cannot be listed or no valid version of the file id exists
     */
    public String getFilenameForRecord(FileSystem fs, final HoodieRecord record, String fileId) {
        try {
            FileStatus[] files = fs.listStatus(new Path(basePath, record.getPartitionPath()));
            Map<String, List<FileStatus>> fileIdToVersions = groupFilesByFileId(files, commits.lastCommit());
            List<FileStatus> statuses = fileIdToVersions.get(fileId);
            // If the record is not found
            if (statuses == null) {
                throw new FileNotFoundException("Cannot find valid versions for fileId " + fileId);
            }
            // Versions are sorted newest first.
            return statuses.get(0).getPath().getName();
        } catch (IOException e) {
            throw new HoodieIOException("Could not get Filename for record " + record, e);
        }
    }

    /**
     * Get, for every file id in the partition, only the latest file version whose commit time is
     * before or on {@code maxCommitTime}.
     *
     * @param fs               file system used to list the partition
     * @param partitionPathStr partition path relative to the base path
     * @param maxCommitTime    upper bound (inclusive) on the commit time of returned files
     * @return one file per file id; an empty array when the partition path does not exist
     * @throws HoodieIOException if the partition cannot be listed
     */
    public FileStatus[] getLatestVersionInPartition(FileSystem fs, String partitionPathStr, String maxCommitTime) {
        try {
            Path partitionPath = new Path(basePath, partitionPathStr);
            if (!fs.exists(partitionPath)) {
                return new FileStatus[0];
            }
            FileStatus[] files = fs.listStatus(partitionPath);
            Map<String, List<FileStatus>> fileIdToVersions = groupFilesByFileId(files, commits.lastCommit());
            Map<String, FileStatus> validFiles = new HashMap<>();
            for (Map.Entry<String, List<FileStatus>> entry : fileIdToVersions.entrySet()) {
                // Versions are sorted newest first, so the first match is the latest permissible one.
                for (FileStatus file : entry.getValue()) {
                    String commitTime = FSUtils.getCommitTime(file.getPath().getName());
                    if (HoodieCommits.isCommit1BeforeOrOn(commitTime, maxCommitTime)) {
                        validFiles.put(entry.getKey(), file);
                        break;
                    }
                }
            }
            return validFiles.values().toArray(new FileStatus[validFiles.size()]);
        } catch (IOException e) {
            throw new HoodieIOException("Could not get latest versions in Partition " + partitionPathStr, e);
        }
    }

    /**
     * Get ALL the data files in partition grouped by fileId and sorted by the commitTime
     * Given a partition path, provide all the files with a list of their commits, sorted by
     * commit time (newest first).
     *
     * @throws HoodieIOException if the partition cannot be listed
     */
    public Map<String, List<FileStatus>> getAllVersionsInPartition(FileSystem fs, String partitionPath) {
        try {
            FileStatus[] files = fs.listStatus(new Path(basePath, partitionPath));
            return groupFilesByFileId(files, commits.lastCommit());
        } catch (IOException e) {
            throw new HoodieIOException("Could not load all file versions in partition " + partitionPath, e);
        }
    }

    /**
     * Get, for every file id, the newest file version whose commit time is one of the given commits.
     * File ids with no version in {@code commitsToReturn} are omitted.
     *
     * @param fileStatuses    candidate files
     * @param commitsToReturn - commits to include
     * @return at most one file per file id; an empty array when {@code commitsToReturn} is empty
     */
    public FileStatus[] getLatestVersionInRange(FileStatus[] fileStatuses, List<String> commitsToReturn) {
        if (commitsToReturn.isEmpty()) {
            return new FileStatus[0];
        }
        try {
            Map<String, List<FileStatus>> fileIdToVersions = groupFilesByFileId(fileStatuses, commits.lastCommit());

            List<FileStatus> statuses = new ArrayList<>();
            for (List<FileStatus> versions : fileIdToVersions.values()) {
                for (FileStatus status : versions) {
                    String commitTime = FSUtils.getCommitTime(status.getPath().getName());
                    if (commitsToReturn.contains(commitTime)) {
                        statuses.add(status);
                        break;
                    }
                }
            }
            return statuses.toArray(new FileStatus[statuses.size()]);
        } catch (IOException e) {
            throw new HoodieIOException("Could not filter files from commits " + commitsToReturn, e);
        }
    }

    /**
     * Get the latest versions of all the files.
     *
     * @param fileStatuses candidate files
     * @return the newest valid version of every file id found in {@code fileStatuses}
     */
    public FileStatus[] getLatestVersions(FileStatus[] fileStatuses) {
        try {
            Map<String, List<FileStatus>> fileIdToVersions = groupFilesByFileId(fileStatuses, commits.lastCommit());

            List<FileStatus> statuses = new ArrayList<>(fileIdToVersions.size());
            for (List<FileStatus> versions : fileIdToVersions.values()) {
                // first file is the latest one; lists in the map are never empty
                statuses.add(versions.get(0));
            }
            return statuses.toArray(new FileStatus[statuses.size()]);
        } catch (IOException e) {
            throw new HoodieIOException("Could not filter files for latest version ", e);
        }
    }

    /**
     * Get the base path for the Hoodie Table
     *
     * @return the base path this instance was constructed with
     */
    public String getBasePath() {
        return basePath;
    }

    /**
     * @return {@code true} when no commits were found at load time
     */
    public boolean isCommitsEmpty() {
        return commits.isEmpty();
    }

    /**
     * A commit time is treated as safe when the table has at least one commit and the given time is
     * either a known commit or is reported by {@link HoodieCommits} as before the earliest commit.
     */
    public boolean isCommitTsSafe(String commitTs) {
        return !isCommitsEmpty() && (commits.isCommitBeforeEarliestCommit(commitTs) || commits.contains(commitTs));
    }

    /** Delegates to {@link HoodieCommits#findCommitsInRange} with {@link #MAX_COMMIT_TS} as the end. */
    public List<String> findCommitsSinceTs(String startTs) {
        return commits.findCommitsInRange(startTs, MAX_COMMIT_TS);
    }

    /** Delegates to {@link HoodieCommits#findCommitsInRange}. */
    public List<String> findCommitsInRange(String startTs, String endTs) {
        return commits.findCommitsInRange(startTs, endTs);
    }

    /** Delegates to {@link HoodieCommits#findCommitsAfter}. */
    public List<String> findCommitsAfter(String startTs, Integer maxCommits) {
        return commits.findCommitsAfter(startTs, maxCommits);
    }

    /**
     * @return the commits loaded when this instance was constructed
     */
    public HoodieCommits getAllCommits() {
        return commits;
    }

    /**
     * @return the inflight commit times loaded when this instance was constructed (the internal list,
     *         not a copy)
     */
    public List<String> getAllInflightCommits() {
        return inflightCommits;
    }

    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder("HoodieTableMetadata{");
        sb.append("commits=").append(commits);
        sb.append('}');
        return sb.toString();
    }

    /**
     * @return the value of {@code hoodie.table.name}, or {@code null} when the property is absent
     */
    public String getTableName() {
        return properties.getProperty(HOODIE_TABLE_NAME_PROP_NAME);
    }

    /**
     * @return the value of {@code hoodie.hdrone.dataset.profile}, or {@code "HOODIE"} when absent
     */
    public String getHDroneDatasetProfile() {
        return properties.getProperty(HOODIE_HDRONE_PROFILE_PROP_NAME, HOODIE_HDRONE_PROFILE_DEFAULT_VALUE);
    }

    /**
     * Initialize the hoodie meta directory and any necessary files inside the meta (including the hoodie.properties)
     *
     * @param metadataFolder meta folder to create the properties file in; created if missing
     * @param tableName      table name to record; must not be {@code null}
     * @throws IOException          if the folder or file cannot be created or written
     * @throws NullPointerException if {@code tableName} is {@code null}
     */
    private void createHoodieProperties(Path metadataFolder, String tableName) throws IOException {
        // Validate and build the properties before touching the file system, so a bad table name
        // cannot leave an empty hoodie.properties behind that later loads would accept as valid.
        if (tableName == null) {
            throw new NullPointerException(
                    "tableName is required to initialize " + HOODIE_PROPERTIES_FILE + " under " + metadataFolder);
        }
        Properties props = new Properties();
        props.setProperty(HOODIE_TABLE_NAME_PROP_NAME, tableName);
        props.setProperty(HOODIE_TABLE_TYPE_PROP_NAME, DEFAULT_TABLE_TYPE.name());

        if (!fs.exists(metadataFolder)) {
            fs.mkdirs(metadataFolder);
        }
        Path propertyPath = new Path(metadataFolder, HOODIE_PROPERTIES_FILE);
        try (FSDataOutputStream outputStream = fs.create(propertyPath)) {
            props.store(outputStream, "Properties saved on " + new Date(System.currentTimeMillis()));
        }
    }

    /**
     * Loads the hoodie table properties from the hoodie.properties file under the .hoodie path
     *
     * @throws IOException if the file cannot be opened or read
     */
    private Properties readHoodieProperties() throws IOException {
        Properties props = new Properties();
        Path propertyPath = new Path(metadataFolder, HOODIE_PROPERTIES_FILE);
        try (FSDataInputStream inputStream = fs.open(propertyPath)) {
            props.load(inputStream);
        }
        return props;
    }

    /**
     * Scan the commit times (only choosing commit file with the given suffix)
     *
     * @param commitFileSuffix suffix a file name in the meta folder must end with to be included
     * @return for every matching file, the part of its name before the first {@code '.'}
     * @throws IOException if the meta folder cannot be listed
     */
    private List<String> scanCommits(final String commitFileSuffix) throws IOException {
        log.info("Attempting to load the commits under " + metadataFolder + " with suffix " + commitFileSuffix);
        // Keep the filter a pure predicate and read the names from the listing result instead of
        // collecting them as a side effect of accept().
        FileStatus[] matches = fs.listStatus(metadataFolder, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().endsWith(commitFileSuffix);
            }
        });
        final List<String> commitFiles = new ArrayList<>();
        if (matches == null) {
            // Defensive: treat a null listing the same as "nothing matched".
            return commitFiles;
        }
        for (FileStatus match : matches) {
            commitFiles.add(match.getPath().getName().split("\\.")[0]);
        }
        return commitFiles;
    }

    /**
     * Takes a bunch of file versions, and returns a map keyed by fileId, with the necessary
     * version safety checking. Each value is the list of that file id's versions, sorted by
     * reverse commit time (commit times are compared as strings), so index 0 is the newest.
     *
     * <p>A file is kept only if its commit time passes {@link #isCommitTsSafe(String)} and is before
     * or on {@code maxCommitTime}.
     *
     * @param files         candidate files
     * @param maxCommitTime maximum permissible commit time
     * @return map of fileId to its versions, newest first; lists are never empty
     */
    private Map<String, List<FileStatus>> groupFilesByFileId(FileStatus[] files, String maxCommitTime)
            throws IOException {
        HashMap<String, List<FileStatus>> fileIdtoVersions = new HashMap<>();
        for (FileStatus file : files) {
            String filename = file.getPath().getName();
            String fileId = FSUtils.getFileId(filename);
            String commitTime = FSUtils.getCommitTime(filename);
            if (isCommitTsSafe(commitTime) && HoodieCommits.isCommit1BeforeOrOn(commitTime, maxCommitTime)) {
                List<FileStatus> versions = fileIdtoVersions.get(fileId);
                if (versions == null) {
                    versions = new ArrayList<>();
                    fileIdtoVersions.put(fileId, versions);
                }
                versions.add(file);
            }
        }
        Comparator<FileStatus> newestFirst = new Comparator<FileStatus>() {
            @Override
            public int compare(FileStatus o1, FileStatus o2) {
                String o1CommitTime = FSUtils.getCommitTime(o1.getPath().getName());
                String o2CommitTime = FSUtils.getCommitTime(o2.getPath().getName());
                // Reverse the order
                return o2CommitTime.compareTo(o1CommitTime);
            }
        };
        for (List<FileStatus> versions : fileIdtoVersions.values()) {
            Collections.sort(versions, newestFirst);
        }
        return fileIdtoVersions;
    }

    /**
     * Two instances are equal when both their commits and their base paths are equal.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }

        HoodieTableMetadata metadata = (HoodieTableMetadata) o;

        if (commits != null ? !commits.equals(metadata.commits) : metadata.commits != null) {
            return false;
        }
        return basePath != null ? basePath.equals(metadata.basePath) : metadata.basePath == null;
    }

    @Override
    public int hashCode() {
        int result = commits != null ? commits.hashCode() : 0;
        result = 31 * result + (basePath != null ? basePath.hashCode() : 0);
        return result;
    }

}