com.uber.hoodie.common.table.view.HoodieTableFileSystemView.java Source code

Java tutorial

Introduction

Below is the full source code for com.uber.hoodie.common.table.view.HoodieTableFileSystemView.java.

Source

/*
 *  Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *           http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package com.uber.hoodie.common.table.view;

import com.uber.hoodie.common.model.FileSlice;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieFileGroup;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.TableFileSystemView;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.model.HoodieLogFile;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.exception.HoodieIOException;

import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Common abstract implementation for multiple TableFileSystemView Implementations. 2 possible
 * implementations are ReadOptimizedView and RealtimeView <p> Concrete implementations extending
 * this abstract class, should only implement getDataFilesInPartition which includes files to be
 * included in the view
 *
 * @see TableFileSystemView
 * @since 0.3.0
 */
/**
 * Common implementation backing multiple {@link TableFileSystemView} flavors
 * (ReadOptimizedView and RealtimeView). File statuses are grouped into
 * {@link HoodieFileGroup}s keyed by (partitionPath, fileId) and cached in-memory;
 * partition listings are fetched lazily on first access via {@link #getAllFileGroups(String)}.
 *
 * <p>Not thread-safe: the internal caches are plain {@link HashMap}s mutated without
 * synchronization.
 *
 * @see TableFileSystemView
 * @since 0.3.0
 */
public class HoodieTableFileSystemView implements TableFileSystemView, TableFileSystemView.ReadOptimizedView,
        TableFileSystemView.RealtimeView, Serializable {

    protected HoodieTableMetaClient metaClient;
    // Transient: FileSystem handles are not serializable; re-acquired in readObject().
    protected transient FileSystem fs;
    // The commits that will be visible for all views extending this view.
    protected HoodieTimeline visibleActiveTimeline;

    // Mapping from partition paths to the file groups contained within them (lazy-populated cache).
    protected HashMap<String, List<HoodieFileGroup>> partitionToFileGroupsMap;
    // Mapping from file id to its file group.
    protected HashMap<String, HoodieFileGroup> fileGroupMap;

    /**
     * Create a file system view, as of the given timeline, with empty caches.
     *
     * @param metaClient            meta client for the hoodie table
     * @param visibleActiveTimeline timeline whose commits are visible through this view
     */
    public HoodieTableFileSystemView(HoodieTableMetaClient metaClient, HoodieTimeline visibleActiveTimeline) {
        this.metaClient = metaClient;
        this.fs = metaClient.getFs();
        this.visibleActiveTimeline = visibleActiveTimeline;
        this.fileGroupMap = new HashMap<>();
        this.partitionToFileGroupsMap = new HashMap<>();
    }

    /**
     * Create a file system view, as of the given timeline, pre-populated with the provided
     * file statuses.
     *
     * @param metaClient            meta client for the hoodie table
     * @param visibleActiveTimeline timeline whose commits are visible through this view
     * @param fileStatuses          statuses to eagerly add to the view's caches
     */
    public HoodieTableFileSystemView(HoodieTableMetaClient metaClient, HoodieTimeline visibleActiveTimeline,
            FileStatus[] fileStatuses) {
        this(metaClient, visibleActiveTimeline);
        addFilesToView(fileStatuses);
    }

    /**
     * Restores the transient {@link FileSystem} handle after deserialization.
     * This method is only used when this object is deserialized in a spark executor.
     *
     * @deprecated
     */
    private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException {
        in.defaultReadObject();
        this.fs = FSUtils.getFs();
    }

    private void writeObject(java.io.ObjectOutputStream out) throws IOException {
        out.defaultWriteObject();
    }

    /**
     * Adds the provided statuses into the file system view, and also caches them inside this object.
     *
     * @param statuses raw file statuses to fold into file groups
     * @return the newly built file groups (also added to the internal caches)
     */
    private List<HoodieFileGroup> addFilesToView(FileStatus[] statuses) {
        // Hoist the base path: it is invariant across all grouped elements.
        final Path basePath = new Path(metaClient.getBasePath());
        Map<Pair<String, String>, List<HoodieDataFile>> dataFiles = convertFileStatusesToDataFiles(statuses)
                .collect(Collectors.groupingBy(dataFile -> Pair.of(
                        FSUtils.getRelativePartitionPath(basePath, dataFile.getFileStatus().getPath().getParent()),
                        dataFile.getFileId())));
        Map<Pair<String, String>, List<HoodieLogFile>> logFiles = convertFileStatusesToLogFiles(statuses)
                .collect(Collectors.groupingBy(logFile -> Pair.of(
                        FSUtils.getRelativePartitionPath(basePath, logFile.getPath().getParent()),
                        logFile.getFileId())));

        // Union of (partition, fileId) keys: a group may have only data files, only log files, or both.
        Set<Pair<String, String>> fileIdSet = new HashSet<>(dataFiles.keySet());
        fileIdSet.addAll(logFiles.keySet());

        List<HoodieFileGroup> fileGroups = new ArrayList<>(fileIdSet.size());
        fileIdSet.forEach(pair -> {
            HoodieFileGroup group = new HoodieFileGroup(pair.getKey(), pair.getValue(), visibleActiveTimeline);
            List<HoodieDataFile> groupDataFiles = dataFiles.get(pair);
            if (groupDataFiles != null) {
                groupDataFiles.forEach(group::addDataFile);
            }
            List<HoodieLogFile> groupLogFiles = logFiles.get(pair);
            if (groupLogFiles != null) {
                groupLogFiles.forEach(group::addLogFile);
            }
            fileGroups.add(group);
        });

        // Add to the caches. NOTE(review): fileGroupMap is keyed by file id alone, so two
        // partitions sharing a file id would collide — presumably ids are globally unique; verify.
        fileGroups.forEach(group -> {
            fileGroupMap.put(group.getId(), group);
            partitionToFileGroupsMap
                    .computeIfAbsent(group.getPartitionPath(), partition -> new ArrayList<>())
                    .add(group);
        });

        return fileGroups;
    }

    /**
     * Filters statuses down to read-optimized (data) files, by file extension.
     * NOTE(review): uses contains() rather than endsWith() on the name — confirm whether a
     * stricter suffix match is intended.
     */
    private Stream<HoodieDataFile> convertFileStatusesToDataFiles(FileStatus[] statuses) {
        Predicate<FileStatus> roFilePredicate = fileStatus -> fileStatus.getPath().getName()
                .contains(metaClient.getTableConfig().getROFileFormat().getFileExtension());
        return Arrays.stream(statuses).filter(roFilePredicate).map(HoodieDataFile::new);
    }

    /**
     * Filters statuses down to realtime (log) files, by file extension.
     * NOTE(review): uses contains() rather than endsWith() on the name — confirm whether a
     * stricter suffix match is intended.
     */
    private Stream<HoodieLogFile> convertFileStatusesToLogFiles(FileStatus[] statuses) {
        Predicate<FileStatus> rtFilePredicate = fileStatus -> fileStatus.getPath().getName()
                .contains(metaClient.getTableConfig().getRTFileFormat().getFileExtension());
        return Arrays.stream(statuses).filter(rtFilePredicate).map(HoodieLogFile::new);
    }

    /** Latest data file of every file group in the given partition (groups without one are skipped). */
    @Override
    public Stream<HoodieDataFile> getLatestDataFiles(final String partitionPath) {
        return getAllFileGroups(partitionPath)
                .map(HoodieFileGroup::getLatestDataFile)
                .filter(Optional::isPresent).map(Optional::get);
    }

    /** Latest data file of every cached file group, across all partitions seen so far. */
    @Override
    public Stream<HoodieDataFile> getLatestDataFiles() {
        return fileGroupMap.values().stream()
                .map(HoodieFileGroup::getLatestDataFile)
                .filter(Optional::isPresent).map(Optional::get);
    }

    /** Latest data file at or before {@code maxCommitTime}, for each group in the partition. */
    @Override
    public Stream<HoodieDataFile> getLatestDataFilesBeforeOrOn(String partitionPath, String maxCommitTime) {
        return getAllFileGroups(partitionPath)
                .map(fileGroup -> fileGroup.getLatestDataFileBeforeOrOn(maxCommitTime))
                .filter(Optional::isPresent).map(Optional::get);
    }

    /** Latest data file within the given commits, for each cached file group. */
    @Override
    public Stream<HoodieDataFile> getLatestDataFilesInRange(List<String> commitsToReturn) {
        return fileGroupMap.values().stream()
                .map(fileGroup -> fileGroup.getLatestDataFileInRange(commitsToReturn))
                .filter(Optional::isPresent).map(Optional::get);
    }

    /** Every data file of every file group in the given partition. */
    @Override
    public Stream<HoodieDataFile> getAllDataFiles(String partitionPath) {
        return getAllFileGroups(partitionPath).flatMap(HoodieFileGroup::getAllDataFiles);
    }

    /** Latest file slice of every file group in the given partition (groups without one are skipped). */
    @Override
    public Stream<FileSlice> getLatestFileSlices(String partitionPath) {
        return getAllFileGroups(partitionPath)
                .map(HoodieFileGroup::getLatestFileSlice)
                .filter(Optional::isPresent).map(Optional::get);
    }

    /** Latest file slice at or before {@code maxCommitTime}, for each group in the partition. */
    @Override
    public Stream<FileSlice> getLatestFileSlicesBeforeOrOn(String partitionPath, String maxCommitTime) {
        return getAllFileGroups(partitionPath)
                .map(fileGroup -> fileGroup.getLatestFileSliceBeforeOrOn(maxCommitTime))
                .filter(Optional::isPresent).map(Optional::get);
    }

    /** Latest file slice within the given commits, for each cached file group. */
    @Override
    public Stream<FileSlice> getLatestFileSliceInRange(List<String> commitsToReturn) {
        return fileGroupMap.values().stream()
                .map(fileGroup -> fileGroup.getLatestFileSliceInRange(commitsToReturn))
                .filter(Optional::isPresent).map(Optional::get);
    }

    /** Every file slice of every file group in the given partition. */
    @Override
    public Stream<FileSlice> getAllFileSlices(String partitionPath) {
        return getAllFileGroups(partitionPath).flatMap(HoodieFileGroup::getAllFileSlices);
    }

    /**
     * Given a partition path, obtain all file groups within it. All partition-level methods
     * go through this. On first access the partition directory is created (if absent), listed,
     * and its file groups built and cached; later calls serve the cache.
     *
     * @param partitionPathStr partition path relative to the table base path
     * @throws HoodieIOException if listing (or creating) the partition directory fails
     */
    @Override
    public Stream<HoodieFileGroup> getAllFileGroups(String partitionPathStr) {
        // Return any previously fetched groups.
        List<HoodieFileGroup> cached = partitionToFileGroupsMap.get(partitionPathStr);
        if (cached != null) {
            return cached.stream();
        }

        try {
            // Create the path if it does not exist already.
            Path partitionPath = new Path(metaClient.getBasePath(), partitionPathStr);
            FSUtils.createPathIfNotExists(fs, partitionPath);
            FileStatus[] statuses = fs.listStatus(partitionPath);
            return addFilesToView(statuses).stream();
        } catch (IOException e) {
            throw new HoodieIOException("Failed to list data files in partition " + partitionPathStr, e);
        }
    }
}