com.uber.hoodie.common.util.FSUtils.java Source code

Java tutorial

Introduction

Here is the source code for com.uber.hoodie.common.util.FSUtils.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.common.util;

import static com.uber.hoodie.common.table.HoodieTableMetaClient.MARKER_EXTN;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.uber.hoodie.common.model.HoodieFileFormat;
import com.uber.hoodie.common.model.HoodieLogFile;
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.collection.Pair;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.exception.HoodieIOException;
import com.uber.hoodie.exception.InvalidHoodiePathException;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.UUID;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

/**
 * Utility functions related to accessing the file storage
 */
public class FSUtils {

    private static final Logger LOG = LogManager.getLogger(FSUtils.class);
    // Log files are of this pattern - .b5068208-e1a4-11e6-bf01-fe55135034f3_20170101134598.log.1
    private static final Pattern LOG_FILE_PATTERN = Pattern
            .compile("\\.(.*)_(.*)\\.(.*)\\.([0-9]*)(_(([0-9]*)-([0-9]*)-([0-9]*)))?");
    private static final String LOG_FILE_PREFIX = ".";
    private static final int MAX_ATTEMPTS_RECOVER_LEASE = 10;
    private static final long MIN_CLEAN_TO_KEEP = 10;
    private static final long MIN_ROLLBACK_TO_KEEP = 10;
    private static final String HOODIE_ENV_PROPS_PREFIX = "HOODIE_ENV_";

    private static final PathFilter ALLOW_ALL_FILTER = new PathFilter() {
        @Override
        public boolean accept(Path file) {
            return true;
        }
    };

    public static Configuration prepareHadoopConf(Configuration conf) {
        conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
        conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

        // look for all properties, prefixed to be picked up
        for (Entry<String, String> prop : System.getenv().entrySet()) {
            if (prop.getKey().startsWith(HOODIE_ENV_PROPS_PREFIX)) {
                LOG.info("Picking up value for hoodie env var :" + prop.getKey());
                conf.set(prop.getKey().replace(HOODIE_ENV_PROPS_PREFIX, "").replaceAll("_DOT_", "."),
                        prop.getValue());
            }
        }
        return conf;
    }

    public static FileSystem getFs(String path, Configuration conf) {
        FileSystem fs;
        conf = prepareHadoopConf(conf);
        try {
            fs = new Path(path).getFileSystem(conf);
        } catch (IOException e) {
            throw new HoodieIOException("Failed to get instance of " + FileSystem.class.getName(), e);
        }
        LOG.info(String.format("Hadoop Configuration: fs.defaultFS: [%s], Config:[%s], FileSystem: [%s]",
                conf.getRaw("fs.defaultFS"), conf.toString(), fs.toString()));
        return fs;
    }

    /**
     * A write token uniquely identifies an attempt at one of the IOHandle operations (Merge/Create/Append)
     */
    public static String makeWriteToken(int taskPartitionId, int stageId, long taskAttemptId) {
        return String.format("%d-%d-%d", taskPartitionId, stageId, taskAttemptId);
    }

    public static String makeDataFileName(String commitTime, String writeToken, String fileId) {
        return String.format("%s_%s_%s.parquet", fileId, writeToken, commitTime);
    }

    public static String makeMarkerFile(String commitTime, String writeToken, String fileId) {
        return String.format("%s_%s_%s%s", fileId, writeToken, commitTime, MARKER_EXTN);
    }

    public static String translateMarkerToDataPath(String basePath, String markerPath, String instantTs) {
        Preconditions.checkArgument(markerPath.endsWith(MARKER_EXTN));
        String markerRootPath = Path
                .getPathWithoutSchemeAndAuthority(new Path(
                        String.format("%s/%s/%s", basePath, HoodieTableMetaClient.TEMPFOLDER_NAME, instantTs)))
                .toString();
        int begin = markerPath.indexOf(markerRootPath);
        Preconditions.checkArgument(begin >= 0,
                "Not in marker dir. Marker Path=" + markerPath + ", Expected Marker Root=" + markerRootPath);
        String rPath = markerPath.substring(begin + markerRootPath.length() + 1);
        return String.format("%s/%s%s", basePath, rPath.replace(MARKER_EXTN, ""),
                HoodieFileFormat.PARQUET.getFileExtension());
    }

    public static String maskWithoutFileId(String commitTime, int taskPartitionId) {
        return String.format("*_%s_%s%s", taskPartitionId, commitTime, HoodieFileFormat.PARQUET.getFileExtension());
    }

    public static String getCommitFromCommitFile(String commitFileName) {
        return commitFileName.split("\\.")[0];
    }

    public static String getCommitTime(String fullFileName) {
        return fullFileName.split("_")[2].split("\\.")[0];
    }

    public static long getFileSize(FileSystem fs, Path path) throws IOException {
        return fs.getFileStatus(path).getLen();
    }

    public static String getFileId(String fullFileName) {
        return fullFileName.split("_")[0];
    }

    /**
     * Gets all partition paths assuming date partitioning (year, month, day) three levels down.
     */
    public static List<String> getAllPartitionFoldersThreeLevelsDown(FileSystem fs, String basePath)
            throws IOException {
        List<String> datePartitions = new ArrayList<>();
        // Avoid listing and including any folders under the metafolder
        PathFilter filter = getExcludeMetaPathFilter();
        FileStatus[] folders = fs.globStatus(new Path(basePath + "/*/*/*"), filter);
        for (FileStatus status : folders) {
            Path path = status.getPath();
            datePartitions.add(String.format("%s/%s/%s", path.getParent().getParent().getName(),
                    path.getParent().getName(), path.getName()));
        }
        return datePartitions;
    }

    /**
     * Given a base partition and a partition path, return
     * relative path of partition path to the base path
     */
    public static String getRelativePartitionPath(Path basePath, Path partitionPath) {
        basePath = Path.getPathWithoutSchemeAndAuthority(basePath);
        partitionPath = Path.getPathWithoutSchemeAndAuthority(partitionPath);
        String partitionFullPath = partitionPath.toString();
        int partitionStartIndex = partitionFullPath.indexOf(basePath.getName(),
                basePath.getParent() == null ? 0 : basePath.getParent().toString().length());
        // Partition-Path could be empty for non-partitioned tables
        return partitionStartIndex + basePath.getName().length() == partitionFullPath.length() ? ""
                : partitionFullPath.substring(partitionStartIndex + basePath.getName().length() + 1);
    }

    /**
     * Obtain all the partition paths, that are present in this table, denoted by presence of {@link
     * com.uber.hoodie.common.model.HoodiePartitionMetadata#HOODIE_PARTITION_METAFILE}
     */
    public static List<String> getAllFoldersWithPartitionMetaFile(FileSystem fs, String basePathStr)
            throws IOException {
        final Path basePath = new Path(basePathStr);
        final List<String> partitions = new ArrayList<>();
        processFiles(fs, basePathStr, (locatedFileStatus) -> {
            Path filePath = locatedFileStatus.getPath();
            if (filePath.getName().equals(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE)) {
                partitions.add(getRelativePartitionPath(basePath, filePath.getParent()));
            }
            return true;
        }, true);
        return partitions;
    }

    public static final List<String> getAllDataFilesForMarkers(FileSystem fs, String basePath, String instantTs,
            String markerDir) throws IOException {
        List<String> dataFiles = new LinkedList<>();
        processFiles(fs, markerDir, (status) -> {
            String pathStr = status.getPath().toString();
            if (pathStr.endsWith(MARKER_EXTN)) {
                dataFiles.add(FSUtils.translateMarkerToDataPath(basePath, pathStr, instantTs));
            }
            return true;
        }, false);
        return dataFiles;
    }

    /**
     * Recursively processes all files in the base-path. If excludeMetaFolder is set, the meta-folder and all its
     * subdirs are skipped
     * @param fs           File System
     * @param basePathStr  Base-Path
     * @param consumer     Callback for processing
     * @param excludeMetaFolder Exclude .hoodie folder
     * @throws IOException
     */
    @VisibleForTesting
    static void processFiles(FileSystem fs, String basePathStr, Function<FileStatus, Boolean> consumer,
            boolean excludeMetaFolder) throws IOException {
        PathFilter pathFilter = excludeMetaFolder ? getExcludeMetaPathFilter() : ALLOW_ALL_FILTER;
        FileStatus[] topLevelStatuses = fs.listStatus(new Path(basePathStr));
        for (int i = 0; i < topLevelStatuses.length; i++) {
            FileStatus child = topLevelStatuses[i];
            if (child.isFile()) {
                boolean success = consumer.apply(child);
                if (!success) {
                    throw new HoodieException("Failed to process file-status=" + child);
                }
            } else if (pathFilter.accept(child.getPath())) {
                RemoteIterator<LocatedFileStatus> itr = fs.listFiles(child.getPath(), true);
                while (itr.hasNext()) {
                    FileStatus status = itr.next();
                    boolean success = consumer.apply(status);
                    if (!success) {
                        throw new HoodieException("Failed to process file-status=" + status);
                    }
                }
            }
        }
    }

    public static List<String> getAllPartitionPaths(FileSystem fs, String basePathStr,
            boolean assumeDatePartitioning) throws IOException {
        if (assumeDatePartitioning) {
            return getAllPartitionFoldersThreeLevelsDown(fs, basePathStr);
        } else {
            return getAllFoldersWithPartitionMetaFile(fs, basePathStr);
        }
    }

    public static String getFileExtension(String fullName) {
        Preconditions.checkNotNull(fullName);
        String fileName = (new File(fullName)).getName();
        int dotIndex = fileName.indexOf('.');
        return dotIndex == -1 ? "" : fileName.substring(dotIndex);
    }

    private static PathFilter getExcludeMetaPathFilter() {
        // Avoid listing and including any folders under the metafolder
        return (path) -> {
            if (path.toString().contains(HoodieTableMetaClient.METAFOLDER_NAME)) {
                return false;
            }
            return true;
        };
    }

    public static String getInstantTime(String name) {
        return name.replace(getFileExtension(name), "");
    }

    /**
     * Returns a new unique prefix for creating a file group.
     */
    public static String createNewFileIdPfx() {
        return UUID.randomUUID().toString();
    }

    /**
     * Get the file extension from the log file
     */
    public static String getFileExtensionFromLog(Path logPath) {
        Matcher matcher = LOG_FILE_PATTERN.matcher(logPath.getName());
        if (!matcher.find()) {
            throw new InvalidHoodiePathException(logPath, "LogFile");
        }
        return matcher.group(3);
    }

    /**
     * Get the first part of the file name in the log file. That will be the fileId. Log file do not
     * have commitTime in the file name.
     */
    public static String getFileIdFromLogPath(Path path) {
        Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName());
        if (!matcher.find()) {
            throw new InvalidHoodiePathException(path, "LogFile");
        }
        return matcher.group(1);
    }

    /**
     * Check if the file is a parquet file of a log file. Then get the fileId appropriately.
     */
    public static String getFileIdFromFilePath(Path filePath) {
        if (FSUtils.isLogFile(filePath)) {
            return FSUtils.getFileIdFromLogPath(filePath);
        }
        return FSUtils.getFileId(filePath.getName());
    }

    /**
     * Get the first part of the file name in the log file. That will be the fileId. Log file do not
     * have commitTime in the file name.
     */
    public static String getBaseCommitTimeFromLogPath(Path path) {
        Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName());
        if (!matcher.find()) {
            throw new InvalidHoodiePathException(path, "LogFile");
        }
        return matcher.group(2);
    }

    /**
     * Get TaskId used in log-path
     */
    public static Integer getTaskPartitionIdFromLogPath(Path path) {
        Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName());
        if (!matcher.find()) {
            throw new InvalidHoodiePathException(path, "LogFile");
        }
        String val = matcher.group(7);
        return val == null ? null : Integer.parseInt(val);
    }

    /**
     * Get Write-Token used in log-path
     */
    public static String getWriteTokenFromLogPath(Path path) {
        Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName());
        if (!matcher.find()) {
            throw new InvalidHoodiePathException(path, "LogFile");
        }
        return matcher.group(6);
    }

    /**
     * Get StageId used in log-path
     */
    public static Integer getStageIdFromLogPath(Path path) {
        Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName());
        if (!matcher.find()) {
            throw new InvalidHoodiePathException(path, "LogFile");
        }
        String val = matcher.group(8);
        return val == null ? null : Integer.parseInt(val);
    }

    /**
     * Get Task Attempt Id used in log-path
     */
    public static Integer getTaskAttemptIdFromLogPath(Path path) {
        Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName());
        if (!matcher.find()) {
            throw new InvalidHoodiePathException(path, "LogFile");
        }
        String val = matcher.group(9);
        return val == null ? null : Integer.parseInt(val);
    }

    /**
     * Get the last part of the file name in the log file and convert to int.
     */
    public static int getFileVersionFromLog(Path logPath) {
        Matcher matcher = LOG_FILE_PATTERN.matcher(logPath.getName());
        if (!matcher.find()) {
            throw new InvalidHoodiePathException(logPath, "LogFile");
        }
        return Integer.parseInt(matcher.group(4));
    }

    public static String makeLogFileName(String fileId, String logFileExtension, String baseCommitTime, int version,
            String writeToken) {
        String suffix = (writeToken == null)
                ? String.format("%s_%s%s.%d", fileId, baseCommitTime, logFileExtension, version)
                : String.format("%s_%s%s.%d_%s", fileId, baseCommitTime, logFileExtension, version, writeToken);
        return LOG_FILE_PREFIX + suffix;
    }

    public static boolean isLogFile(Path logPath) {
        Matcher matcher = LOG_FILE_PATTERN.matcher(logPath.getName());
        if (!matcher.find()) {
            return false;
        }
        return true;
    }

    /**
     * Get the latest log file written from the list of log files passed in
     */
    public static Optional<HoodieLogFile> getLatestLogFile(Stream<HoodieLogFile> logFiles) {
        return logFiles.sorted(HoodieLogFile.getReverseLogFileComparator()).findFirst();
    }

    /**
     * Get all the log files for the passed in FileId in the partition path
     */
    public static Stream<HoodieLogFile> getAllLogFiles(FileSystem fs, Path partitionPath, final String fileId,
            final String logFileExtension, final String baseCommitTime) throws IOException {
        return Arrays
                .stream(fs.listStatus(partitionPath,
                        path -> path.getName().startsWith("." + fileId)
                                && path.getName().contains(logFileExtension)))
                .map(HoodieLogFile::new).filter(s -> s.getBaseCommitTime().equals(baseCommitTime));
    }

    /**
     * Get the latest log version for the fileId in the partition path
     */
    public static Optional<Pair<Integer, String>> getLatestLogVersion(FileSystem fs, Path partitionPath,
            final String fileId, final String logFileExtension, final String baseCommitTime) throws IOException {
        Optional<HoodieLogFile> latestLogFile = getLatestLogFile(
                getAllLogFiles(fs, partitionPath, fileId, logFileExtension, baseCommitTime));
        if (latestLogFile.isPresent()) {
            return Optional.of(Pair.of(latestLogFile.get().getLogVersion(),
                    getWriteTokenFromLogPath(latestLogFile.get().getPath())));
        }
        return Optional.empty();
    }

    /**
     * computes the next log version for the specified fileId in the partition path
     */
    public static int computeNextLogVersion(FileSystem fs, Path partitionPath, final String fileId,
            final String logFileExtension, final String baseCommitTime) throws IOException {
        Optional<Pair<Integer, String>> currentVersionWithWriteToken = getLatestLogVersion(fs, partitionPath,
                fileId, logFileExtension, baseCommitTime);
        // handle potential overflow
        return (currentVersionWithWriteToken.isPresent()) ? currentVersionWithWriteToken.get().getKey() + 1
                : HoodieLogFile.LOGFILE_BASE_VERSION;
    }

    public static int getDefaultBufferSize(final FileSystem fs) {
        return fs.getConf().getInt("io.file.buffer.size", 4096);
    }

    public static Short getDefaultReplication(FileSystem fs, Path path) {
        return fs.getDefaultReplication(path);
    }

    /**
     * When a file was opened and the task died without closing the stream, another task executor
     * cannot open because the existing lease will be active. We will try to recover the lease, from
     * HDFS. If a data node went down, it takes about 10 minutes for the lease to be rocovered. But if
     * the client dies, this should be instant.
     */
    public static boolean recoverDFSFileLease(final DistributedFileSystem dfs, final Path p)
            throws IOException, InterruptedException {
        LOG.info("Recover lease on dfs file " + p);
        // initiate the recovery
        boolean recovered = false;
        for (int nbAttempt = 0; nbAttempt < MAX_ATTEMPTS_RECOVER_LEASE; nbAttempt++) {
            LOG.info("Attempt " + nbAttempt + " to recover lease on dfs file " + p);
            recovered = dfs.recoverLease(p);
            if (recovered) {
                break;
            }
            // Sleep for 1 second before trying again. Typically it takes about 2-3 seconds to recover
            // under default settings
            Thread.sleep(1000);
        }
        return recovered;
    }

    public static void deleteOlderCleanMetaFiles(FileSystem fs, String metaPath, Stream<HoodieInstant> instants) {
        //TODO - this should be archived when archival is made general for all meta-data
        // skip MIN_CLEAN_TO_KEEP and delete rest
        instants.skip(MIN_CLEAN_TO_KEEP).map(s -> {
            try {
                return fs.delete(new Path(metaPath, s.getFileName()), false);
            } catch (IOException e) {
                throw new HoodieIOException("Could not delete clean meta files" + s.getFileName(), e);
            }
        });
    }

    public static void deleteOlderRollbackMetaFiles(FileSystem fs, String metaPath,
            Stream<HoodieInstant> instants) {
        //TODO - this should be archived when archival is made general for all meta-data
        // skip MIN_ROLLBACK_TO_KEEP and delete rest
        instants.skip(MIN_ROLLBACK_TO_KEEP).map(s -> {
            try {
                return fs.delete(new Path(metaPath, s.getFileName()), false);
            } catch (IOException e) {
                throw new HoodieIOException("Could not delete rollback meta files " + s.getFileName(), e);
            }
        });
    }

    public static void deleteOlderRestoreMetaFiles(FileSystem fs, String metaPath, Stream<HoodieInstant> instants) {
        //TODO - this should be archived when archival is made general for all meta-data
        // skip MIN_ROLLBACK_TO_KEEP and delete rest
        instants.skip(MIN_ROLLBACK_TO_KEEP).map(s -> {
            try {
                return fs.delete(new Path(metaPath, s.getFileName()), false);
            } catch (IOException e) {
                throw new HoodieIOException("Could not delete restore meta files " + s.getFileName(), e);
            }
        });
    }

    public static void createPathIfNotExists(FileSystem fs, Path partitionPath) throws IOException {
        if (!fs.exists(partitionPath)) {
            fs.mkdirs(partitionPath);
        }
    }

    public static Long getSizeInMB(long sizeInBytes) {
        return sizeInBytes / (1024 * 1024);
    }

    public static Path getPartitionPath(String basePath, String partitionPath) {
        return getPartitionPath(new Path(basePath), partitionPath);
    }

    public static Path getPartitionPath(Path basePath, String partitionPath) {
        // FOr non-partitioned table, return only base-path
        return ((partitionPath == null) || (partitionPath.isEmpty())) ? basePath
                : new Path(basePath, partitionPath);
    }
}