com.netflix.genie.common.internal.dto.JobDirectoryManifest.java Source code

Java tutorial

Introduction

Here is the source code for com.netflix.genie.common.internal.dto.JobDirectoryManifest.java

Source

/*
 *
 *  Copyright 2019 Netflix, Inc.
 *
 *     Licensed under the Apache License, Version 2.0 (the "License");
 *     you may not use this file except in compliance with the License.
 *     You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 *     Unless required by applicable law or agreed to in writing, software
 *     distributed under the License is distributed on an "AS IS" BASIS,
 *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *     See the License for the specific language governing permissions and
 *     limitations under the License.
 *
 */
package com.netflix.genie.common.internal.dto;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonGetter;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import com.netflix.genie.common.internal.jobs.JobConstants;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.ToString;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;

import javax.annotation.Nullable;
import javax.validation.constraints.Min;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.AccessDeniedException;
import java.nio.file.DirectoryStream;
import java.nio.file.FileSystemLoopException;
import java.nio.file.FileVisitOption;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.BasicFileAttributes;
import java.time.Instant;
import java.util.Collection;
import java.util.EnumSet;
import java.util.Optional;
import java.util.Set;

/**
 * A manifest of all the files and subdirectories in a Genie job working directory.
 *
 * @author tgianos
 * @since 4.0.0
 */
@ToString(doNotUseGetters = true)
@EqualsAndHashCode(doNotUseGetters = true)
public class JobDirectoryManifest {
    private static final String ENTRIES_KEY = "entries";
    private static final String EMPTY_STRING = "";

    private final ImmutableMap<String, ManifestEntry> entries;
    private final ImmutableSet<ManifestEntry> files;
    private final ImmutableSet<ManifestEntry> directories;
    private final int numFiles;
    private final int numDirectories;
    private final long totalSizeOfFiles;

    /**
     * Create a manifest from the given job directory.
     *
     * @param directory The job directory to create a manifest from
     * @throws IOException If there is an error reading the directory
     */
    public JobDirectoryManifest(final Path directory) throws IOException {
        this(directory, true);
    }

    /**
     * Create a manifest from the given job directory.
     *
     * @param directory              The job directory to create a manifest from
     * @param calculateFileChecksums Whether or not to calculate checksums for each file added to the manifest
     * @throws IOException If there is an error reading the directory
     */
    public JobDirectoryManifest(final Path directory, final boolean calculateFileChecksums) throws IOException {
        // Walk the directory
        final ImmutableMap.Builder<String, ManifestEntry> builder = ImmutableMap.builder();
        final ManifestVisitor manifestVisitor = new ManifestVisitor(directory, builder, calculateFileChecksums);
        final EnumSet<FileVisitOption> options = EnumSet.of(FileVisitOption.FOLLOW_LINKS);
        Files.walkFileTree(directory, options, Integer.MAX_VALUE, manifestVisitor);
        this.entries = builder.build();

        final ImmutableSet.Builder<ManifestEntry> filesBuilder = ImmutableSet.builder();
        final ImmutableSet.Builder<ManifestEntry> directoriesBuilder = ImmutableSet.builder();

        long sizeOfFiles = 0L;
        for (final ManifestEntry entry : this.entries.values()) {
            if (entry.isDirectory()) {
                directoriesBuilder.add(entry);
            } else {
                filesBuilder.add(entry);
                sizeOfFiles += entry.getSize();
            }
        }

        this.totalSizeOfFiles = sizeOfFiles;
        this.directories = directoriesBuilder.build();
        this.files = filesBuilder.build();
        this.numDirectories = this.directories.size();
        this.numFiles = this.files.size();
    }

    /**
     * Create a manifest from an existing set of entries. Generally this should be used to regenerate an in memory
     * manifest instance from JSON.
     *
     * @param entries The entries in this manifest
     */
    @JsonCreator
    public JobDirectoryManifest(
            @JsonProperty(value = ENTRIES_KEY, required = true) final Set<ManifestEntry> entries) {
        final ImmutableMap.Builder<String, ManifestEntry> builder = ImmutableMap.builder();
        final ImmutableSet.Builder<ManifestEntry> filesBuilder = ImmutableSet.builder();
        final ImmutableSet.Builder<ManifestEntry> directoriesBuilder = ImmutableSet.builder();

        long sizeOfFiles = 0L;
        for (final ManifestEntry entry : entries) {
            builder.put(entry.getPath(), entry);
            if (entry.isDirectory()) {
                directoriesBuilder.add(entry);
            } else {
                filesBuilder.add(entry);
                sizeOfFiles += entry.getSize();
            }
        }
        this.entries = builder.build();
        this.totalSizeOfFiles = sizeOfFiles;
        this.directories = directoriesBuilder.build();
        this.files = filesBuilder.build();
        this.numDirectories = this.directories.size();
        this.numFiles = this.files.size();
    }

    /**
     * Check whether an entry exists for the given path.
     *
     * @param path The path to check. Relative to the root of the original job directory.
     * @return {@code true} if an entry exists for this path
     */
    public boolean hasEntry(final String path) {
        return this.entries.containsKey(path);
    }

    /**
     * Get the entry, if one exists, for the given path.
     *
     * @param path The path to get an entry for. Relative to the root of the original job directory.
     * @return The entry wrapped in an {@link Optional} or {@link Optional#empty()} if no entry exists
     */
    @JsonIgnore
    public Optional<ManifestEntry> getEntry(final String path) {
        return Optional.ofNullable(this.entries.get(path));
    }

    /**
     * A getter used to mask internal implementation for JSON serialization.
     *
     * @return All the entries as a collection.
     */
    @JsonGetter(ENTRIES_KEY)
    Collection<ManifestEntry> getEntries() {
        return this.entries.values();
    }

    /**
     * Get all the entries that are files for this manifest.
     *
     * @return All the file {@link ManifestEntry}'s as an immutable set.
     */
    @JsonIgnore
    public Set<ManifestEntry> getFiles() {
        return this.files;
    }

    /**
     * Get all the entries that are directories for this manifest.
     *
     * @return All the directory {@link ManifestEntry}'s as an immutable set.
     */
    @JsonIgnore
    public Set<ManifestEntry> getDirectories() {
        return this.directories;
    }

    /**
     * Get the total number of files in this manifest.
     *
     * @return The total number of files that are in this job directory
     */
    @JsonIgnore
    public int getNumFiles() {
        return this.numFiles;
    }

    /**
     * Get the total number of directories in this manifest.
     *
     * @return The total number of sub directories that are in this job directory
     */
    @JsonIgnore
    public int getNumDirectories() {
        return this.numDirectories;
    }

    /**
     * Get the total size of the files contained in this manifest.
     *
     * @return The total size (in bytes) of all the files in this job directory
     */
    @JsonIgnore
    public long getTotalSizeOfFiles() {
        return this.totalSizeOfFiles;
    }

    @Slf4j
    private static class ManifestVisitor extends SimpleFileVisitor<Path> {

        private final Path root;
        private final ImmutableMap.Builder<String, ManifestEntry> builder;
        private final Metadata metadata;
        private final TikaConfig tikaConfig;
        private final boolean checksumFiles;

        ManifestVisitor(final Path root, final ImmutableMap.Builder<String, ManifestEntry> builder,
                final boolean checksumFiles) throws IOException {
            this.root = root;
            this.builder = builder;
            this.checksumFiles = checksumFiles;
            this.metadata = new Metadata();
            try {
                this.tikaConfig = new TikaConfig();
            } catch (final TikaException te) {
                log.error("Unable to create Tika Configuration due to error", te);
                throw new IOException(te);
            }
        }

        /**
         * {@inheritDoc}
         */
        @Override
        public FileVisitResult preVisitDirectory(final Path dir, final BasicFileAttributes attrs)
                throws IOException {
            final ManifestEntry entry = this.buildEntry(dir, attrs, true);
            log.debug("Created manifest entry for directory {}", entry);
            this.builder.put(entry.getPath(), entry);

            // Temporary hack To mitigate an ongoing issue.
            // Building manifests for deep application dependencies directory trees is putting a strain on the
            // application and causing user errors.
            // Until a proper fix is in place, skip the dependencies sub-trees.
            if (JobConstants.DEPENDENCY_FILE_PATH_PREFIX.equals(entry.getName())) {
                return FileVisitResult.SKIP_SUBTREE;
            }
            return FileVisitResult.CONTINUE;
        }

        /**
         * {@inheritDoc}
         */
        @Override
        public FileVisitResult visitFile(final Path file, final BasicFileAttributes attrs) throws IOException {
            final ManifestEntry entry = this.buildEntry(file, attrs, false);
            log.debug("Created manifest entry for file {}", entry);
            this.builder.put(entry.getPath(), entry);
            return FileVisitResult.CONTINUE;
        }

        /**
         * {@inheritDoc}
         */
        @Override
        public FileVisitResult visitFileFailed(final Path file, final IOException ioe) {
            if (ioe instanceof FileSystemLoopException) {
                log.warn("Detected file system cycle visiting while visiting {}. Skipping.", file, ioe);
                return FileVisitResult.SKIP_SUBTREE;
            } else if (ioe instanceof AccessDeniedException) {
                log.warn("Access denied for file {}. Skipping", file, ioe);
                return FileVisitResult.SKIP_SUBTREE;
            } else {
                log.error("Got unknown error {} while visiting {}. Terminating visitor", ioe.getMessage(), file,
                        ioe);
                // TODO: Not sure if we should do this or skip subtree or just continue and ignore it?
                return FileVisitResult.TERMINATE;
            }
        }

        private ManifestEntry buildEntry(final Path entry, final BasicFileAttributes attributes,
                final boolean directory) throws IOException {
            final String path = this.root.relativize(entry).toString();
            final Path fileName = entry.getFileName();
            final String name = fileName == null ? EMPTY_STRING : fileName.toString();
            final Instant lastModifiedTime = attributes.lastModifiedTime().toInstant();
            final Instant lastAccessTime = attributes.lastAccessTime().toInstant();
            final Instant creationTime = attributes.creationTime().toInstant();
            final long size = attributes.size();

            String md5 = null;
            String mimeType = null;
            if (!directory) {
                if (this.checksumFiles) {
                    try (InputStream data = Files.newInputStream(entry, StandardOpenOption.READ)) {
                        md5 = DigestUtils.md5Hex(data);
                    } catch (final IOException ioe) {
                        // For now MD5 isn't critical or required so we'll swallow errors here
                        log.error("Unable to create MD5 for {} due to error", entry, ioe);
                    }
                }

                mimeType = this.getMimeType(name, entry);
            }

            final Set<String> children = Sets.newHashSet();
            if (directory) {
                try (DirectoryStream<Path> directoryStream = Files.newDirectoryStream(entry)) {
                    for (final Path child : directoryStream) {
                        children.add(this.root.relativize(child).toString());
                    }
                }
            }

            String parent = null;
            if (StringUtils.isNotEmpty(path)) {
                // Not the root
                parent = this.root.relativize(entry.getParent()).toString();
            }

            return new ManifestEntry(path, name, lastModifiedTime, lastAccessTime, creationTime, directory, size,
                    md5, mimeType, parent, children);
        }

        private String getMimeType(final String name, final Path path) {
            // TODO: Move configuration of special handling cases to external configuration for flexibility
            //       probably a map of filename -> type or extension -> type or produced mime-type -> desired mime-type
            switch (name) {
            case "stdout":
            case "stderr":
            case "run":
                return MediaType.TEXT_PLAIN.toString();
            default:
                try (TikaInputStream inputStream = TikaInputStream.get(path)) {
                    return this.tikaConfig.getDetector().detect(inputStream, this.metadata).toString();
                } catch (final IOException ioe) {
                    log.error("Unable to detect mime type for {} due to error", path, ioe);
                    return MediaType.OCTET_STREAM.toString();
                }
            }
        }
    }

    /**
     * Representation of the metadata for a job file on a given underlying storage system.
     *
     * @author tgianos
     * @since 4.0.0
     */
    @Getter
    @ToString(doNotUseGetters = true)
    @EqualsAndHashCode(doNotUseGetters = true)
    public static class ManifestEntry {
        private final String path;
        private final String name;
        private final Instant lastModifiedTime;
        private final Instant lastAccessTime;
        private final Instant creationTime;
        private final boolean directory;
        @Min(value = 0L, message = "A file can't have a negative size")
        private final long size;
        private final String md5;
        private final String mimeType;
        private final String parent;
        private final Set<String> children;

        /**
         * Constructor.
         *
         * @param path             The relative path to the entry from the root of the job directory
         * @param name             The name of the entry
         * @param lastModifiedTime The time the entry was last modified
         * @param lastAccessTime   The time the entry was last accessed
         * @param creationTime     The time the entry was created
         * @param directory        Whether this entry is a directory or not
         * @param size             The current size of the entry within the storage system in bytes. Min 0
         * @param md5              The md5 hex of the file contents if it's not a directory
         * @param mimeType         The mime type of the file. Null if its a directory
         * @param parent           Optional entry for the path of this entries parent relative to root
         * @param children         The set of paths, from the root, representing children of this entry if any
         */
        @JsonCreator
        public ManifestEntry(@JsonProperty(value = "path", required = true) final String path,
                @JsonProperty(value = "name", required = true) final String name,
                @JsonProperty(value = "lastModifiedTime", required = true) final Instant lastModifiedTime,
                @JsonProperty(value = "lastAccessTime", required = true) final Instant lastAccessTime,
                @JsonProperty(value = "creationTime", required = true) final Instant creationTime,
                @JsonProperty(value = "directory", required = true) final boolean directory,
                @JsonProperty(value = "size", required = true) final long size,
                @JsonProperty(value = "md5") @Nullable final String md5,
                @JsonProperty(value = "mimeType") @Nullable final String mimeType,
                @JsonProperty(value = "parent") @Nullable final String parent,
                @JsonProperty(value = "children", required = true) final Set<String> children) {
            this.path = path;
            this.name = name;
            this.lastModifiedTime = lastModifiedTime;
            this.lastAccessTime = lastAccessTime;
            this.creationTime = creationTime;
            this.directory = directory;
            this.size = size;
            this.md5 = md5;
            this.mimeType = mimeType;
            this.parent = parent;
            this.children = ImmutableSet.copyOf(children);
        }

        /**
         * Get the MD5 hash of the file (as 32 hex characters) if it was calculated.
         *
         * @return The MD5 value or {@link Optional#empty()}
         */
        public Optional<String> getMd5() {
            return Optional.ofNullable(this.md5);
        }

        /**
         * Get the mime type of this file if it was calculated.
         *
         * @return The mime type value or {@link Optional#empty()}
         */
        public Optional<String> getMimeType() {
            return Optional.ofNullable(this.mimeType);
        }

        /**
         * Get the relative path from root of the parent of this entry if there was one.
         * There likely wouldn't be one for the root of the job directory.
         *
         * @return The relative path from root of the parent wrapped in an {@link Optional}
         */
        public Optional<String> getParent() {
            return Optional.ofNullable(this.parent);
        }
    }
}