org.apache.impala.common.FileSystemUtil.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.impala.common.FileSystemUtil.java

Source

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.impala.common;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.Locale;
import java.util.UUID;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.s3a.S3AFileSystem;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.client.HdfsAdmin;
import org.apache.hadoop.hdfs.protocol.EncryptionZone;
import org.apache.impala.catalog.HdfsCompression;
import org.apache.log4j.Logger;

import com.google.common.base.Objects;
import com.google.common.base.Preconditions;

/**
 * Common utility functions for operating on FileSystem objects.
 */
public class FileSystemUtil {
    private static final Configuration CONF = new Configuration();
    private static final Logger LOG = Logger.getLogger(FileSystemUtil.class);

    /**
     * Performs a non-recursive delete of all visible (non-hidden) files in a given
     * directory. Sub-directories and hidden files are left untouched.
     *
     * @param directory the directory whose visible files are removed; its file status
     *     must report a directory
     * @return the number of files for which the filesystem reported a successful delete
     * @throws IOException if the directory cannot be inspected or listed, or if a delete
     *     call fails with an exception
     */
    public static int deleteAllVisibleFiles(Path directory) throws IOException {
        FileSystem fs = directory.getFileSystem(CONF);
        Preconditions.checkState(fs.getFileStatus(directory).isDirectory());
        int numFilesDeleted = 0;
        for (FileStatus fStatus : fs.listStatus(directory)) {
            Path filePath = fStatus.getPath();
            // Only delete files that are not hidden.
            if (!fStatus.isFile() || isHiddenFile(filePath.getName())) {
                continue;
            }
            if (LOG.isTraceEnabled()) {
                LOG.trace("Removing: " + filePath);
            }
            // delete() reports some failures through its boolean result rather than an
            // exception, so only count the files that were actually removed.
            if (fs.delete(filePath, false)) {
                ++numFilesDeleted;
            } else {
                LOG.warn("Failed to delete: " + filePath);
            }
        }
        return numFilesDeleted;
    }

    /**
     * Counts the visible (non-hidden) files that are direct children of a directory.
     * Sub-directories are not counted and are not descended into.
     *
     * @param directory the directory to inspect; its file status must report a directory
     * @return the number of visible files directly inside 'directory'
     * @throws IOException if the directory cannot be inspected or listed
     */
    public static int getTotalNumVisibleFiles(Path directory) throws IOException {
        FileSystem dirFs = directory.getFileSystem(CONF);
        Preconditions.checkState(dirFs.getFileStatus(directory).isDirectory());
        int visibleCount = 0;
        for (FileStatus entry : dirFs.listStatus(directory)) {
            // Only count regular files ...
            if (!entry.isFile()) {
                continue;
            }
            // ... that are not hidden.
            if (isHiddenFile(entry.getPath().getName())) {
                continue;
            }
            ++visibleCount;
        }
        return visibleCount;
    }

    /**
     * Tells whether two paths live in the same HDFS encryption zone. Two paths that are
     * both outside of any encryption zone count as being in the same zone.
     *
     * @param fs the filesystem whose URI is used to create the HdfsAdmin client
     * @param p1 the first path
     * @param p2 the second path
     * @return true if both paths are on a distributed filesystem and either share an
     *     encryption zone or are both in none; false if the zones differ or if either
     *     path is not on a distributed filesystem
     * @throws IOException if resolving a filesystem or looking up a zone fails
     */
    private static boolean arePathsInSameHdfsEncryptionZone(FileSystem fs, Path p1, Path p2) throws IOException {
        // Only distributed file systems have encryption zones.
        boolean bothOnDfs = isDistributedFileSystem(p1) && isDistributedFileSystem(p2);
        if (!bothOnDfs) {
            return false;
        }
        HdfsAdmin admin = new HdfsAdmin(fs.getUri(), CONF);
        EncryptionZone zone1 = admin.getEncryptionZoneForPath(p1);
        EncryptionZone zone2 = admin.getEncryptionZoneForPath(p2);
        if (zone1 == null) {
            // A null zone is treated as "not in any zone": the paths match only if the
            // second path has no zone either.
            return zone2 == null;
        }
        return zone2 != null && zone1.equals(zone2);
    }

    /**
     * Relocates every visible (non-hidden) file found directly inside a source directory
     * into a destination directory by delegating to
     * {@link #relocateFile(Path, Path, boolean)}, which decides between a move (rename)
     * and a copy. When a file with the same name already exists in the destination, the
     * incoming file gets a UUID inserted into its base name. Sub-directories of the
     * source directory are skipped.
     *
     * @param sourceDir the directory to take files from; must be a directory
     * @param destDir the directory to place files into; must be a directory
     * @return the number of files relocated
     * @throws IOException if listing, checking or relocating a file fails
     */
    public static int relocateAllVisibleFiles(Path sourceDir, Path destDir) throws IOException {
        FileSystem destFs = destDir.getFileSystem(CONF);
        FileSystem sourceFs = sourceDir.getFileSystem(CONF);
        Preconditions.checkState(destFs.isDirectory(destDir));
        Preconditions.checkState(sourceFs.isDirectory(sourceDir));

        // A single UUID is shared by all renamed files of this call so that files with
        // dependent names (for example foo.lzo and foo.lzo_index) stay recognisably
        // related after conflict resolution.
        String conflictSuffix = UUID.randomUUID().toString();

        int relocatedCount = 0;
        for (FileStatus entry : sourceFs.listStatus(sourceDir)) {
            Path sourceFile = entry.getPath();
            if (entry.isDirectory()) {
                if (LOG.isTraceEnabled()) {
                    LOG.trace("Skipping copy of directory: " + sourceFile);
                }
                continue;
            }
            if (isHiddenFile(sourceFile.getName())) {
                continue;
            }

            Path target = new Path(destDir, sourceFile.getName());
            if (destFs.exists(target)) {
                target = new Path(destDir, appendToBaseFileName(target.getName(), conflictSuffix));
            }
            // Conflicts were resolved above, so relocateFile() is told not to rename.
            relocateFile(sourceFile, target, false);
            ++relocatedCount;
        }
        return relocatedCount;
    }

    /**
     * Relocates the given file to a new location (either another directory or a
     * file in the same or different filesystem). The file is generally moved (renamed) to
     * the new location. However, the file is copied (and the source deleted) if the
     * source and destination are in different encryption zones so that the file can be
     * decrypted and/or encrypted, or if the source and destination are in different
     * filesystems.
     *
     * @param sourceFile the file to relocate
     * @param dest the destination; if it is an existing directory the file keeps its
     *     name inside that directory, otherwise 'dest' is used as the target file path
     * @param renameIfAlreadyExists if true and the target file already exists, a random
     *     UUID is inserted into the target's base file name (preserving the extension)
     *     instead of reusing the conflicting name
     * @throws IOException if a filesystem operation fails, including a move (rename)
     *     that the filesystem rejects by returning false
     */
    public static void relocateFile(Path sourceFile, Path dest, boolean renameIfAlreadyExists) throws IOException {
        FileSystem destFs = dest.getFileSystem(CONF);
        FileSystem sourceFs = sourceFile.getFileSystem(CONF);

        boolean destIsDirectory = destFs.isDirectory(dest);
        Path destFile = destIsDirectory ? new Path(dest, sourceFile.getName()) : dest;
        // If a file with the same name does not already exist in the destination location
        // then use the same file name. Otherwise, generate a unique file name.
        if (renameIfAlreadyExists && destFs.exists(destFile)) {
            Path destDir = destIsDirectory ? dest : dest.getParent();
            destFile = new Path(destDir, appendToBaseFileName(destFile.getName(), UUID.randomUUID().toString()));
        }
        boolean sameFileSystem = isPathOnFileSystem(sourceFile, destFs);
        boolean destIsDfs = isDistributedFileSystem(destFs);

        // Encryption zones only influence the decision when both paths are on the same
        // distributed filesystem. Skipping the lookup otherwise avoids asking the
        // destination filesystem about a source path that may not belong to it.
        boolean sameEncryptionZone = destIsDfs && sameFileSystem
                && arePathsInSameHdfsEncryptionZone(destFs, sourceFile, destFile);
        // A rename is possible if the src and dst are on the same filesystem and, for a
        // distributed filesystem, also in the same encryption zone.
        boolean doRename = sameFileSystem && (!destIsDfs || sameEncryptionZone);
        if (doRename) {
            if (LOG.isTraceEnabled()) {
                LOG.trace(String.format("Moving '%s' to '%s'", sourceFile.toString(), destFile.toString()));
            }
            // Move (rename) the file. rename() signals some failures (such as a name
            // conflict at the destination) by returning false, so surface that to the
            // caller instead of silently leaving the file where it was.
            if (!destFs.rename(sourceFile, destFile)) {
                throw new IOException(String.format("Failed to move '%s' to '%s'", sourceFile, destFile));
            }
            return;
        }
        if (destIsDfs && sameFileSystem) {
            Preconditions.checkState(!doRename);
            // We must copy rather than move if the source and dest are in different
            // encryption zones. A move would return an error from the NN because a move is a
            // metadata-only operation and the files would not be encrypted/decrypted properly
            // on the DNs.
            if (LOG.isTraceEnabled()) {
                LOG.trace(String.format("Copying source '%s' to '%s' because HDFS encryption zones are different.",
                        sourceFile, destFile));
            }
        } else {
            Preconditions.checkState(!sameFileSystem);
            if (LOG.isTraceEnabled()) {
                LOG.trace(String.format("Copying '%s' to '%s' between filesystems.", sourceFile, destFile));
            }
        }
        // Copy with deleteSource=true and overwrite=true.
        FileUtil.copy(sourceFs, sourceFile, destFs, destFile, true, true, CONF);
    }

    /**
     * Reads the whole file at the given path into a String. No charset is passed to
     * IOUtils.toString(), so decoding follows that method's default.
     *
     * @param file the file to read
     * @return the contents of the file
     * @throws IOException if the file cannot be opened or read
     */
    public static String readFile(Path file) throws IOException {
        final InputStream in = file.getFileSystem(CONF).open(file);
        final String contents;
        try {
            contents = IOUtils.toString(in);
        } finally {
            // Always release the stream; errors on close are deliberately ignored.
            IOUtils.closeQuietly(in);
        }
        return contents;
    }

    /**
     * Derives a new file name from a base file name by inserting "_" followed by
     * appendStr in front of the last '.' of the name, or at the very end when the name
     * contains no '.'.
     * For example, with a UUID string used to uniquify files:
     * file1.snap -> file1_<uuid>.snap
     * file1 -> file1_<uuid>
     */
    private static String appendToBaseFileName(String baseFileName, String appendStr) {
        String suffix = "_" + appendStr;
        int dotIdx = baseFileName.lastIndexOf('.');
        if (dotIdx == -1) {
            // No extension: the suffix simply goes at the end.
            return baseFileName + suffix;
        }
        // Split right before the last '.', so the extension (dot included) is kept.
        String stem = baseFileName.substring(0, dotIdx);
        String extension = baseFileName.substring(dotIdx);
        return stem + suffix + extension;
    }

    /**
     * Checks whether a directory has at least one visible (non-hidden) sub-directory
     * among its direct children.
     *
     * @param directory the directory to list
     * @return true as soon as a visible sub-directory is found, otherwise false
     * @throws FileNotFoundException declared because the directory listing can fail for
     *     a missing path
     * @throws IOException if the filesystem cannot be resolved or listed
     */
    public static boolean containsVisibleSubdirectory(Path directory) throws FileNotFoundException, IOException {
        FileSystem dirFs = directory.getFileSystem(CONF);
        for (FileStatus entry : dirFs.listStatus(directory)) {
            // Plain files are irrelevant here.
            if (!entry.isDirectory()) {
                continue;
            }
            if (!isHiddenFile(entry.getPath().getName())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Creates a uniquely named temporary directory inside the given directory. The name
     * is ".tmp_" followed by a random UUID; the leading '.' makes it a hidden entry as
     * far as {@link #isHiddenFile(String)} is concerned.
     *
     * @param directory the parent directory
     * @return the path of the temporary sub-directory
     * @throws IOException if the filesystem cannot be resolved or mkdirs() fails with an
     *     exception
     */
    public static Path makeTmpSubdirectory(Path directory) throws IOException {
        FileSystem parentFs = directory.getFileSystem(CONF);
        String tmpName = ".tmp_" + UUID.randomUUID().toString();
        Path tmpDir = new Path(directory, tmpName);
        parentFs.mkdirs(tmpDir);
        return tmpDir;
    }

    /**
     * Returns true if the given file name denotes a hidden or temporary file. The check
     * is case-insensitive and independent of the JVM's default locale.
     *
     * @param fileName the file name (last path component) to test; must not be null
     * @return true if the name starts with '.' or '_' or ends with ".copying" or ".tmp"
     */
    public static boolean isHiddenFile(String fileName) {
        // Hidden files start with '.' or '_'. The '.copying' suffix is used by some
        // filesystem utilities (e.g. hdfs put) as a temporary destination when copying
        // files. The '.tmp' suffix is Flume's default for temporary files.
        // Locale.ROOT keeps the lower-casing stable: with a Turkish default locale 'I'
        // would otherwise become a dotless 'i' and ".COPYING" would not be recognised.
        String lcFileName = fileName.toLowerCase(Locale.ROOT);
        return lcFileName.startsWith(".") || lcFileName.startsWith("_") || lcFileName.endsWith(".copying")
                || lcFileName.endsWith(".tmp");
    }

    /**
     * Decides whether the file described by 'fileStatus' counts as a data file under
     * Impala's partitioning rules. Directories, hidden files and LZO index files are
     * rejected. LZO index files are skipped because they are read by the scanner
     * directly. Currently Impala doesn't allow subdirectories in the partition paths.
     *
     * @param fileStatus the status of the candidate file
     * @return true if the entry is a visible, non-index regular data file
     */
    public static boolean isValidDataFile(FileStatus fileStatus) {
        String name = fileStatus.getPath().getName();
        if (fileStatus.isDirectory()) {
            return false;
        }
        if (FileSystemUtil.isHiddenFile(name)) {
            return false;
        }
        return HdfsCompression.fromFileName(name) != HdfsCompression.LZO_INDEX;
    }

    /**
     * Tells whether the filesystem is expected to provide storage UUIDs in
     * BlockLocation calls. Distributed filesystems do; S3A and local filesystems are
     * known not to; every other filesystem is assumed to.
     *
     * @param fs the filesystem to test
     * @return false only for S3AFileSystem and LocalFileSystem instances
     */
    public static boolean supportsStorageIds(FileSystem fs) {
        // Common case.
        if (isDistributedFileSystem(fs)) {
            return true;
        }
        // Filesystems that are known not to include storage UUIDs.
        boolean knownWithoutStorageIds = fs instanceof S3AFileSystem || fs instanceof LocalFileSystem;
        return !knownWithoutStorageIds;
    }

    /**
     * Returns true iff the filesystem is a S3AFileSystem.
     *
     * @param fs the filesystem to test; a null reference yields false because
     *     instanceof is false for null
     * @return true if 'fs' is an instance of S3AFileSystem, false otherwise
     */
    public static boolean isS3AFileSystem(FileSystem fs) {
        return fs instanceof S3AFileSystem;
    }

    /**
     * Returns true iff the filesystem that owns the given path is a S3AFileSystem.
     *
     * @param path the path whose filesystem is resolved with the shared configuration
     * @return true if the path is on a S3AFileSystem
     * @throws IOException if the filesystem for the path cannot be resolved
     */
    public static boolean isS3AFileSystem(Path path) throws IOException {
        FileSystem owningFs = path.getFileSystem(CONF);
        return isS3AFileSystem(owningFs);
    }

    /**
     * Returns true iff the filesystem is an instance of LocalFileSystem.
     *
     * @param fs the filesystem to test; a null reference yields false because
     *     instanceof is false for null
     * @return true if 'fs' is an instance of LocalFileSystem, false otherwise
     */
    public static boolean isLocalFileSystem(FileSystem fs) {
        return fs instanceof LocalFileSystem;
    }

    /**
     * Returns true iff the filesystem that owns the given path is a LocalFileSystem.
     *
     * @param path the path whose filesystem is resolved with the shared configuration
     * @return true if the path is on a local filesystem
     * @throws IOException if the filesystem for the path cannot be resolved
     */
    public static boolean isLocalFileSystem(Path path) throws IOException {
        FileSystem owningFs = path.getFileSystem(CONF);
        return isLocalFileSystem(owningFs);
    }

    /**
     * Returns true iff the filesystem is a DistributedFileSystem.
     *
     * @param fs the filesystem to test; a null reference yields false because
     *     instanceof is false for null
     * @return true if 'fs' is an instance of DistributedFileSystem, false otherwise
     */
    public static boolean isDistributedFileSystem(FileSystem fs) {
        return fs instanceof DistributedFileSystem;
    }

    /**
     * Returns true iff the filesystem that owns the given path is a
     * DistributedFileSystem.
     *
     * @param path the path whose filesystem is resolved with the shared configuration
     * @return true if the path is on a DFS filesystem
     * @throws IOException if the filesystem for the path cannot be resolved
     */
    public static boolean isDistributedFileSystem(Path path) throws IOException {
        FileSystem owningFs = path.getFileSystem(CONF);
        return isDistributedFileSystem(owningFs);
    }

    /**
     * Returns the filesystem that the default URI of the shared configuration points to.
     *
     * @return the filesystem for the configured default URI
     * @throws IOException if that filesystem cannot be resolved
     */
    public static FileSystem getDefaultFileSystem() throws IOException {
        URI defaultUri = FileSystem.getDefaultUri(CONF);
        return new Path(defaultUri).getFileSystem(CONF);
    }

    /**
     * Returns the default filesystem as a DistributedFileSystem. The precondition check
     * fails if the default filesystem is of any other type.
     *
     * @return the default filesystem, cast to DistributedFileSystem
     * @throws IOException if the default filesystem cannot be resolved
     */
    public static DistributedFileSystem getDistributedFileSystem() throws IOException {
        FileSystem defaultFs = getDefaultFileSystem();
        Preconditions.checkState(isDistributedFileSystem(defaultFs));
        return (DistributedFileSystem) defaultFs;
    }

    /**
     * Fully-qualifies the given path based on the FileSystem configuration. A location
     * whose scheme differs from the default URI's scheme is returned unchanged.
     *
     * @param location the path to qualify
     * @return the path qualified against the default URI, or 'location' itself when it
     *     already carries a different scheme
     */
    public static Path createFullyQualifiedPath(Path location) {
        URI defaultUri = FileSystem.getDefaultUri(CONF);
        String scheme = location.toUri().getScheme();
        // The default URI must not be applied to a location with a different scheme:
        // Path.makeQualified() would incorrectly use the authority from the default URI
        // even though the schemes don't match.  See HDFS-7031.
        boolean hasForeignScheme = scheme != null && !scheme.equalsIgnoreCase(defaultUri.getScheme());
        if (hasForeignScheme) {
            // Already qualified (has scheme).
            return location;
        }
        return location.makeQualified(defaultUri, location);
    }

    /**
     * Return true iff the path is on the given filesystem.
     *
     * @param path the path to test
     * @param fs the filesystem the path is checked against
     * @return false if qualifying the path against 'fs' is rejected with an
     *     IllegalArgumentException, true otherwise
     */
    public static boolean isPathOnFileSystem(Path path, FileSystem fs) {
        boolean onFileSystem = true;
        try {
            // makeQualified() is called only for the side-effect of
            // FileSystem.checkPath(), which throws if path is not on fs. The qualified
            // path itself is not needed.
            fs.makeQualified(path);
        } catch (IllegalArgumentException e) {
            // Path is not on fs.
            onFileSystem = false;
        }
        return onFileSystem;
    }

    /**
     * Copies the source file to a destination path on the local filesystem.
     *
     * @param source the file to copy; its filesystem is resolved with the shared
     *     configuration
     * @param dest the local destination path
     * @throws IOException on failure
     */
    public static void copyToLocal(Path source, Path dest) throws IOException {
        source.getFileSystem(CONF).copyToLocalFile(source, dest);
    }

    /**
     * Delete the file at 'path' if it exists. This is a best-effort operation: an
     * IOException raised while checking or deleting is logged as a warning and not
     * propagated to the caller.
     *
     * @param path the path to delete; if it refers to a directory, the directory is
     *     removed recursively
     */
    public static void deleteIfExists(Path path) {
        try {
            FileSystem fs = path.getFileSystem(CONF);
            if (!fs.exists(path)) {
                return;
            }
            // The single-argument FileSystem.delete(Path) is deprecated and delegates to
            // delete(path, true); call the two-argument form so the recursive behaviour
            // is explicit.
            fs.delete(path, true);
        } catch (IOException e) {
            LOG.warn("Encountered an exception deleting file at path " + path.toString(), e);
        }
    }

    /**
     * Returns true if the given path is a location which supports caching (e.g. HDFS).
     * Only paths on a distributed filesystem qualify.
     *
     * @param path the path to test
     * @return true if the path is on a distributed filesystem; false otherwise, also
     *     when the filesystem of the path cannot be resolved
     */
    public static boolean isPathCacheable(Path path) {
        boolean cacheable;
        try {
            cacheable = isDistributedFileSystem(path);
        } catch (IOException e) {
            // An unresolvable filesystem is treated as not cacheable.
            cacheable = false;
        }
        return cacheable;
    }

    /**
     * Returns true if Path 'p' is a descendant of Path 'parent', false otherwise.
     * 'p' is walked up towards the root until it reaches the depth of 'parent'; the
     * ancestor found there is compared with 'parent' using Path.equals(). A path that
     * equals 'parent' is therefore reported as a descendant, and the result is false
     * whenever the walk ends at the root. Path.equals() requires paths to have the same
     * scheme and authority to compare equal, so both 'p' and 'parent' should either be
     * qualified or unqualified paths for this function to behave as expected.
     *
     * @param p the candidate descendant; null yields false
     * @param parent the candidate ancestor; null yields false
     * @return true if an ancestor of 'p' (or 'p' itself) equals 'parent'
     */
    public static boolean isDescendantPath(Path p, Path parent) {
        if (p == null || parent == null) {
            return false;
        }
        // Climb from 'p' until the depth matches that of 'parent' or the root is hit.
        Path ancestor = p;
        while (!ancestor.isRoot() && ancestor.depth() != parent.depth()) {
            ancestor = ancestor.getParent();
        }
        if (ancestor.isRoot()) {
            return false;
        }
        if (ancestor.equals(parent)) {
            return true;
        }
        if (LOG.isTraceEnabled()) {
            // Add a message to the log if the paths have inconsistent qualification,
            // since that alone can make otherwise matching paths compare unequal.
            URI ancestorUri = ancestor.toUri();
            URI parentUri = parent.toUri();
            boolean schemeDiffers = !Objects.equal(ancestorUri.getScheme(), parentUri.getScheme());
            boolean authorityDiffers = !Objects.equal(ancestorUri.getAuthority(), parentUri.getAuthority());
            if (schemeDiffers || authorityDiffers) {
                LOG.trace("Inconsistent schema or authority for paths: " + ancestor.toString() + " "
                        + parent.toString());
            }
        }
        return false;
    }

    /**
     * Returns the configuration.
     *
     * @return the static Configuration instance that the methods of this class use to
     *     resolve filesystems; the same object is returned on every call, not a copy
     */
    public static Configuration getConfiguration() {
        return CONF;
    }

    /**
     * Returns true iff the given location is on a filesystem that Impala can write to,
     * which is the case for distributed, local and S3A filesystems.
     *
     * @param location the location to test, as a path string
     * @return true if the location is on one of the writable filesystem types
     * @throws IOException if the filesystem of the location cannot be resolved
     */
    public static boolean isImpalaWritableFilesystem(String location) throws IOException {
        Path path = new Path(location);
        if (FileSystemUtil.isDistributedFileSystem(path)) {
            return true;
        }
        if (FileSystemUtil.isLocalFileSystem(path)) {
            return true;
        }
        return FileSystemUtil.isS3AFileSystem(path);
    }
}