org.apache.beam.sdk.io.FileSystems.java Source code

Introduction

Here is the source code for org.apache.beam.sdk.io.FileSystems.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Verify.verify;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Predicate;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;
import com.google.common.collect.Ordering;
import com.google.common.collect.Sets;
import com.google.common.collect.TreeMultimap;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.ServiceLoader;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nonnull;
import org.apache.beam.sdk.annotations.Experimental;
import org.apache.beam.sdk.annotations.Experimental.Kind;
import org.apache.beam.sdk.annotations.Internal;
import org.apache.beam.sdk.io.fs.CreateOptions;
import org.apache.beam.sdk.io.fs.CreateOptions.StandardCreateOptions;
import org.apache.beam.sdk.io.fs.MatchResult;
import org.apache.beam.sdk.io.fs.MatchResult.Metadata;
import org.apache.beam.sdk.io.fs.MatchResult.Status;
import org.apache.beam.sdk.io.fs.MoveOptions;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.util.common.ReflectHelpers;
import org.apache.beam.sdk.values.KV;

/**
 * Client-facing {@link FileSystem} utility.
 */
@Experimental(Kind.FILESYSTEM)
public class FileSystems {

    /**
     * Scheme key of the fallback {@link FileSystem}: {@link #getFileSystemInternal} returns the
     * file system registered under this key when none is registered for the requested scheme.
     */
    public static final String DEFAULT_SCHEME = "default";
    /**
     * Matches specs that start with a scheme: one letter followed by any number of letters,
     * digits, '-', '+' or '.', then a ':'. The scheme is captured in the named group
     * {@code scheme}. Note that a single letter followed by ':' (for example a drive-letter
     * prefix) also matches this pattern.
     */
    private static final Pattern FILE_SCHEME_PATTERN = Pattern.compile("(?<scheme>[a-zA-Z][-a-zA-Z0-9+.]*):.*");

    /**
     * Registered file systems keyed by scheme. Initially only {@code "file"} is mapped, to a
     * {@link LocalFileSystem}; {@link #setDefaultPipelineOptions} replaces the whole map.
     */
    private static final AtomicReference<Map<String, FileSystem>> SCHEME_TO_FILESYSTEM = new AtomicReference<Map<String, FileSystem>>(
            ImmutableMap.<String, FileSystem>of("file", new LocalFileSystem()));

    /********************************** METHODS FOR CLIENT **********************************/

    /**
     * This is the entry point to convert user-provided specs to {@link ResourceId ResourceIds}.
     * Callers should use {@link #match} to resolve ambiguities in user-provided specs before
     * calling other methods.
     *
     * <p>Implementation handles the following ambiguities of a user-provided spec:
     * <ol>
     * <li>{@code spec} could be a glob or a uri. {@link #match} should be able to tell and
     * choose efficient implementations.
     * <li>The user-provided {@code spec} might refer to files or directories. It is common that
     * users that wish to indicate a directory will omit the trailing path delimiter, such as
     * {@code "/tmp/dir"} in Linux. The {@link FileSystem} should be able to recognize a directory
     * with the trailing path delimiter omitted, but should always return a correct {@link ResourceId}
     * (e.g., {@code "/tmp/dir/"}) inside the returned {@link MatchResult}.
     * </ol>
     *
     * <p>All {@link FileSystem} implementations should support glob in the final hierarchical path
     * component of {@link ResourceId}. This allows SDK libraries to construct file system agnostic
     * spec. {@link FileSystem FileSystems} can support additional patterns for user-provided specs.
     *
     * @return {@code List<MatchResult>} in the same order of the input specs.
     *
     * @throws IllegalArgumentException if specs are invalid -- empty or have different schemes.
     * @throws IOException if all specs failed to match due to issues like:
     * network connection, authorization.
     * Exception for individual spec is deferred until callers retrieve
     * metadata with {@link MatchResult#metadata()}.
     */
    public static List<MatchResult> match(List<String> specs) throws IOException {
        // All specs must share a single scheme; that scheme selects the file system
        // that performs the bulk match.
        String sharedScheme = getOnlyScheme(specs);
        FileSystem fileSystem = getFileSystemInternal(sharedScheme);
        return fileSystem.match(specs);
    }

    /**
     * Like {@link #match(List)}, but for a single resource specification.
     *
     * <p>The function {@link #match(List)} is preferred when matching multiple patterns, as it allows
     * for bulk API calls to remote filesystems.
     */
    public static MatchResult match(String spec) throws IOException {
        // Delegate to the bulk variant with a one-element list.
        List<MatchResult> results = match(Collections.singletonList(spec));
        // One spec in must mean one MatchResult out; anything else means the FileSystem
        // implementation did not honor the contract.
        boolean exactlyOne = results.size() == 1;
        verify(exactlyOne, "FileSystem implementation for %s did not return exactly one MatchResult: %s",
                spec, results);
        return results.get(0);
    }

    /**
     * Returns the {@link Metadata} for a single file resource. Expects a resource specification
     * {@code spec} that matches a single result.
     *
     * @param spec a resource specification that matches exactly one result.
     * @return the {@link Metadata} for the specified resource.
     * @throws FileNotFoundException if the file resource is not found.
     * @throws IOException in the event of an error in the inner call to {@link #match},
     * or if the given spec does not match exactly 1 result.
     */
    public static Metadata matchSingleFileSpec(String spec) throws IOException {
        MatchResult onlyResult = Iterables.getOnlyElement(FileSystems.match(Collections.singletonList(spec)));

        // Reject every non-OK status first, reporting a missing file as FileNotFoundException.
        Status status = onlyResult.status();
        if (status == Status.NOT_FOUND) {
            throw new FileNotFoundException(String.format("File spec %s not found", spec));
        }
        if (status != Status.OK) {
            throw new IOException(String.format("Error matching file spec %s: status %s", spec, status));
        }

        // The match succeeded; it must resolve to exactly one resource.
        List<Metadata> matched = onlyResult.metadata();
        if (matched.size() == 1) {
            return matched.get(0);
        }
        throw new IOException(
                String.format("Expecting spec %s to match exactly one file, but matched %s: %s", spec,
                        matched.size(), matched));
    }

    /**
     * Returns {@link MatchResult MatchResults} for the given {@link ResourceId resourceIds}.
     *
     * @param resourceIds {@link ResourceId resourceIds} that might be derived from {@link #match},
     * {@link ResourceId#resolve}, or {@link ResourceId#getCurrentDirectory()}.
     *
     * @throws IOException if all {@code resourceIds} failed to match due to issues like:
     * network connection, authorization.
     * Exceptions for an individual {@link ResourceId} need to be deferred until callers retrieve
     * metadata with {@link MatchResult#metadata()}.
     */
    public static List<MatchResult> matchResources(List<ResourceId> resourceIds) throws IOException {
        // Each ResourceId is matched through its string form, reusing the spec-based match.
        Function<ResourceId, String> toSpec = new Function<ResourceId, String>() {
            @Override
            public String apply(@Nonnull ResourceId resourceId) {
                return resourceId.toString();
            }
        };
        List<String> specs = FluentIterable.from(resourceIds).transform(toSpec).toList();
        return match(specs);
    }

    /**
     * Returns a write channel for the given {@link ResourceId}.
     *
     * <p>The resource is not expanded; it is used verbatim.
     *
     * @param resourceId the reference of the file-like resource to create
     * @param mimeType the MIME type of the file-like resource to create
     */
    public static WritableByteChannel create(ResourceId resourceId, String mimeType) throws IOException {
        // Wrap the MIME type in standard create options and defer to the options-based overload.
        CreateOptions mimeTypeOnly = StandardCreateOptions.builder().setMimeType(mimeType).build();
        return create(resourceId, mimeTypeOnly);
    }

    /**
     * Returns a write channel for the given {@link ResourceId} with {@link CreateOptions}.
     *
     * <p>The resource is not expanded; it is used verbatim.
     *
     * @param resourceId the reference of the file-like resource to create
     * @param createOptions the configuration of the create operation
     */
    public static WritableByteChannel create(ResourceId resourceId, CreateOptions createOptions)
            throws IOException {
        // The resource's own scheme decides which file system creates it.
        FileSystem fileSystem = getFileSystemInternal(resourceId.getScheme());
        return fileSystem.create(resourceId, createOptions);
    }

    /**
     * Returns a read channel for the given {@link ResourceId}.
     *
     * <p>The resource is not expanded; it is used verbatim.
     *
     * <p>If seeking is supported, then this returns a
     * {@link java.nio.channels.SeekableByteChannel}.
     *
     * @param resourceId the reference of the file-like resource to open
     */
    public static ReadableByteChannel open(ResourceId resourceId) throws IOException {
        // The resource's own scheme decides which file system opens it.
        FileSystem fileSystem = getFileSystemInternal(resourceId.getScheme());
        return fileSystem.open(resourceId);
    }

    /**
     * Copies a {@link List} of file-like resources from one location to another.
     *
     * <p>The number of source resources must equal the number of destination resources.
     * Destination resources will be created recursively.
     *
     * <p>{@code srcResourceIds} and {@code destResourceIds} must have the same scheme.
     *
     * <p>It doesn't support copying globs.
     *
     * @param srcResourceIds the references of the source resources
     * @param destResourceIds the references of the destination resources
     */
    public static void copy(List<ResourceId> srcResourceIds, List<ResourceId> destResourceIds,
            MoveOptions... moveOptions) throws IOException {
        validateSrcDestLists(srcResourceIds, destResourceIds);
        if (srcResourceIds.isEmpty()) {
            // Nothing to copy.
            return;
        }

        boolean ignoreMissing = Sets.newHashSet(moveOptions)
                .contains(MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES);
        List<ResourceId> sources;
        List<ResourceId> destinations;
        if (ignoreMissing) {
            // Drop every (source, destination) pair whose source was not found.
            KV<List<ResourceId>, List<ResourceId>> present = filterMissingFiles(srcResourceIds, destResourceIds);
            sources = present.getKey();
            destinations = present.getValue();
        } else {
            sources = srcResourceIds;
            destinations = destResourceIds;
        }

        if (!sources.isEmpty()) {
            // Validation guarantees one shared scheme, so the first source picks the file system.
            FileSystem fileSystem = getFileSystemInternal(sources.get(0).getScheme());
            fileSystem.copy(sources, destinations);
        }
    }

    /**
     * Renames a {@link List} of file-like resources from one location to another.
     *
     * <p>The number of source resources must equal the number of destination resources.
     * Destination resources will be created recursively.
     *
     * <p>{@code srcResourceIds} and {@code destResourceIds} must have the same scheme.
     *
     * <p>It doesn't support renaming globs.
     *
     * @param srcResourceIds the references of the source resources
     * @param destResourceIds the references of the destination resources
     */
    public static void rename(List<ResourceId> srcResourceIds, List<ResourceId> destResourceIds,
            MoveOptions... moveOptions) throws IOException {
        validateSrcDestLists(srcResourceIds, destResourceIds);
        if (srcResourceIds.isEmpty()) {
            // Nothing to rename.
            return;
        }

        boolean ignoreMissing = Sets.newHashSet(moveOptions)
                .contains(MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES);
        List<ResourceId> sources;
        List<ResourceId> destinations;
        if (ignoreMissing) {
            // Drop every (source, destination) pair whose source was not found.
            KV<List<ResourceId>, List<ResourceId>> present = filterMissingFiles(srcResourceIds, destResourceIds);
            sources = present.getKey();
            destinations = present.getValue();
        } else {
            sources = srcResourceIds;
            destinations = destResourceIds;
        }

        if (!sources.isEmpty()) {
            // Validation guarantees one shared scheme, so the first source picks the file system.
            FileSystem fileSystem = getFileSystemInternal(sources.get(0).getScheme());
            fileSystem.rename(sources, destinations);
        }
    }

    /**
     * Deletes a collection of resources.
     *
     * <p>It is allowed but not recommended to delete directories recursively.
     * Callers depend on {@link FileSystems} and use {@code DeleteOptions}.
     *
     * <p>{@code resourceIds} must have the same scheme.
     *
     * @param resourceIds the references of the resources to delete.
     */
    public static void delete(Collection<ResourceId> resourceIds, MoveOptions... moveOptions) throws IOException {
        if (resourceIds.isEmpty()) {
            // Nothing to delete.
            return;
        }

        boolean ignoreMissing = Sets.newHashSet(moveOptions)
                .contains(MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES);
        Collection<ResourceId> targets = resourceIds;
        if (ignoreMissing) {
            // Keeps every match result except those reported as NOT_FOUND.
            Predicate<MatchResult> notMissing = new Predicate<MatchResult>() {
                @Override
                public boolean apply(@Nonnull MatchResult matchResult) {
                    return !matchResult.status().equals(MatchResult.Status.NOT_FOUND);
                }
            };
            // Expands a match result into its metadata; the checked IOException is rethrown
            // unchecked because Function#apply cannot declare it.
            Function<MatchResult, Iterable<Metadata>> toMetadata = new Function<MatchResult, Iterable<Metadata>>() {
                @Nonnull
                @Override
                public Iterable<Metadata> apply(@Nonnull MatchResult input) {
                    try {
                        return Lists.newArrayList(input.metadata());
                    } catch (IOException e) {
                        throw new RuntimeException(
                                String.format("Failed to get metadata from MatchResult: %s.", input), e);
                    }
                }
            };
            Function<Metadata, ResourceId> toResourceId = new Function<Metadata, ResourceId>() {
                @Nonnull
                @Override
                public ResourceId apply(@Nonnull Metadata input) {
                    return input.resourceId();
                }
            };

            // Re-resolve the inputs and delete only the resources the match actually returned.
            List<MatchResult> matched = matchResources(Lists.newArrayList(resourceIds));
            targets = FluentIterable.from(matched)
                    .filter(notMissing)
                    .transformAndConcat(toMetadata)
                    .transform(toResourceId)
                    .toList();
        }

        if (targets.isEmpty()) {
            return;
        }
        // The first remaining resource picks the file system that handles the whole batch.
        ResourceId first = targets.iterator().next();
        getFileSystemInternal(first.getScheme()).delete(targets);
    }

    /**
     * Filters out the (source, destination) pairs whose source does not exist.
     *
     * <p>The sources are matched via {@link #matchResources}; a pair is dropped only when the
     * source's match status is {@link Status#NOT_FOUND}, so pairs with any other status are kept.
     *
     * @param srcResourceIds the source resources to check
     * @param destResourceIds the destinations, positionally paired with {@code srcResourceIds}
     * @return the surviving sources as key and their destinations as value, in input order
     * @throws IOException if matching the sources fails
     */
    private static KV<List<ResourceId>, List<ResourceId>> filterMissingFiles(List<ResourceId> srcResourceIds,
            List<ResourceId> destResourceIds) throws IOException {
        validateSrcDestLists(srcResourceIds, destResourceIds);
        if (srcResourceIds.isEmpty()) {
            // Nothing to match; avoid calling into the file system at all.
            return KV.of(Collections.<ResourceId>emptyList(), Collections.<ResourceId>emptyList());
        }

        List<MatchResult> sourceMatches = matchResources(srcResourceIds);
        List<ResourceId> keptSources = new ArrayList<>();
        List<ResourceId> keptDestinations = new ArrayList<>();
        // Match results are positional, so index i of the results describes source i.
        for (int index = 0; index < sourceMatches.size(); index++) {
            boolean missing = sourceMatches.get(index).status().equals(Status.NOT_FOUND);
            if (missing) {
                continue;
            }
            keptSources.add(srcResourceIds.get(index));
            keptDestinations.add(destResourceIds.get(index));
        }
        return KV.of(keptSources, keptDestinations);
    }

    /**
     * Checks that the source and destination lists can be processed pairwise by one file system.
     *
     * @param srcResourceIds the source resources
     * @param destResourceIds the destination resources
     * @throws IllegalArgumentException if the lists differ in size, or if the resources do not
     * all share a single scheme
     */
    private static void validateSrcDestLists(List<ResourceId> srcResourceIds, List<ResourceId> destResourceIds) {
        int sourceCount = srcResourceIds.size();
        int destinationCount = destResourceIds.size();
        checkArgument(sourceCount == destinationCount,
                "Number of source resource ids %s must equal number of destination resource ids %s",
                sourceCount, destinationCount);

        if (srcResourceIds.isEmpty()) {
            // Both lists are empty, so there are no schemes to compare.
            return;
        }

        Function<ResourceId, String> toScheme = new Function<ResourceId, String>() {
            @Override
            public String apply(@Nonnull ResourceId resourceId) {
                return resourceId.getScheme();
            }
        };
        // Collect the distinct schemes across both lists; exactly one is acceptable.
        Set<String> distinctSchemes = FluentIterable.from(srcResourceIds)
                .append(destResourceIds)
                .transform(toScheme)
                .toSet();
        if (distinctSchemes.size() != 1) {
            throw new IllegalArgumentException(
                    String.format("Expect srcResourceIds and destResourceIds have the same scheme, but received %s.",
                            Joiner.on(", ").join(distinctSchemes)));
        }
    }

    /**
     * Returns the single scheme shared by all {@code specs}.
     *
     * @param specs the user-provided specs; must not be empty
     * @return the scheme common to every spec, as produced by {@link #parseScheme}
     * @throws IllegalArgumentException if {@code specs} is empty or the specs do not all resolve
     * to the same scheme
     */
    private static String getOnlyScheme(List<String> specs) {
        checkArgument(!specs.isEmpty(), "Expect specs are not empty.");
        Function<String, String> toScheme = new Function<String, String>() {
            @Override
            public String apply(String spec) {
                return parseScheme(spec);
            }
        };
        // De-duplicate the schemes; getOnlyElement rejects anything but exactly one.
        Set<String> distinctSchemes = FluentIterable.from(specs).transform(toScheme).toSet();
        return Iterables.getOnlyElement(distinctSchemes);
    }

    /**
     * Extracts the scheme of a user-provided spec.
     *
     * @param spec the spec to inspect; must not be {@code null}
     * @return the spec's scheme in lower case, or {@code "file"} when the spec does not start
     * with a scheme
     */
    private static String parseScheme(String spec) {
        // The spec is almost, but not quite, a URI. In particular,
        // the reserved characters '[', ']', and '?' have meanings that differ
        // from their use in the URI spec. ('*' is not reserved).
        // Here, we just need the scheme, which is so circumscribed as to be
        // very easy to extract with a regex.
        Matcher matcher = FILE_SCHEME_PATTERN.matcher(spec);

        if (matcher.matches()) {
            // Lower-case with Locale.ROOT so the result does not depend on the JVM's default
            // locale: under a Turkish locale, "FILE".toLowerCase() produces a dotless i
            // (U+0131) and would no longer equal the registered "file" scheme.
            return matcher.group("scheme").toLowerCase(Locale.ROOT);
        }
        // No scheme prefix: treat the spec as a local file path.
        return "file";
    }

    /**
     * Internal method to get {@link FileSystem} for {@code scheme}.
     *
     * <p>The scheme is lower-cased with {@link Locale#ROOT} before the lookup, so the lookup is
     * case-insensitive and independent of the JVM's default locale. When no file system is
     * registered for the scheme, the one registered under {@link #DEFAULT_SCHEME} is returned
     * instead, if there is one.
     *
     * @param scheme the scheme to look up; must not be {@code null}
     * @return the file system registered for {@code scheme}, or the default file system
     * @throws IllegalStateException if neither {@code scheme} nor {@link #DEFAULT_SCHEME} has a
     * registered file system
     */
    @VisibleForTesting
    static FileSystem getFileSystemInternal(String scheme) {
        // Locale.ROOT keeps the key stable everywhere: with a Turkish default locale,
        // "FILE".toLowerCase() contains a dotless i (U+0131) and would miss the "file" entry.
        String lowerCaseScheme = scheme.toLowerCase(Locale.ROOT);
        Map<String, FileSystem> schemeToFileSystem = SCHEME_TO_FILESYSTEM.get();

        FileSystem forScheme = schemeToFileSystem.get(lowerCaseScheme);
        if (forScheme != null) {
            return forScheme;
        }
        // Fall back to the catch-all file system, if one was registered.
        FileSystem fallback = schemeToFileSystem.get(DEFAULT_SCHEME);
        if (fallback != null) {
            return fallback;
        }
        throw new IllegalStateException("Unable to find registrar for " + scheme);
    }

    /********************************** METHODS FOR REGISTRATION **********************************/

    /**
     * Sets the default configuration in workers.
     *
     * <p>It will be used in {@link FileSystemRegistrar FileSystemRegistrars} for all schemes.
     *
     * <p>This is expected only to be used by runners after {@code Pipeline.run}, or in tests.
     */
    @Internal
    public static void setDefaultPipelineOptions(PipelineOptions options) {
        checkNotNull(options, "options");

        // Collect every registrar visible to the service loader into a set ordered by
        // ObjectsClassComparator (presumably ordering by class for determinism; confirm in
        // ReflectHelpers).
        Set<FileSystemRegistrar> registrars = Sets.newTreeSet(ReflectHelpers.ObjectsClassComparator.INSTANCE);
        ServiceLoader<FileSystemRegistrar> discovered = ServiceLoader.load(FileSystemRegistrar.class,
                ReflectHelpers.findClassLoader());
        registrars.addAll(Lists.newArrayList(discovered));

        // Build the new scheme table first, then swap it in as a whole.
        Map<String, FileSystem> bySchemes = verifySchemesAreUnique(options, registrars);
        SCHEME_TO_FILESYSTEM.set(bySchemes);
    }

    /**
     * Builds the scheme-to-file-system map from the given registrars, rejecting duplicates.
     *
     * @param options the pipeline options handed to each registrar's {@code fromOptions}
     * @param registrars the registrars that supply the file systems
     * @return an immutable map from scheme to its single file system
     * @throws IllegalStateException if more than one file system is registered for a scheme
     */
    @VisibleForTesting
    static Map<String, FileSystem> verifySchemesAreUnique(PipelineOptions options,
            Set<FileSystemRegistrar> registrars) {
        // Group every file system produced by the registrars under its scheme, schemes sorted.
        Multimap<String, FileSystem> bySchemes = TreeMultimap.create(Ordering.<String>natural(),
                Ordering.arbitrary());
        for (FileSystemRegistrar registrar : registrars) {
            for (FileSystem fileSystem : registrar.fromOptions(options)) {
                bySchemes.put(fileSystem.getScheme(), fileSystem);
            }
        }

        // Fail on the first scheme that has more than one file system, naming the offenders.
        Function<FileSystem, String> toClassName = new Function<FileSystem, String>() {
            @Override
            public String apply(@Nonnull FileSystem input) {
                return input.getClass().getName();
            }
        };
        for (Entry<String, Collection<FileSystem>> perScheme : bySchemes.asMap().entrySet()) {
            Collection<FileSystem> candidates = perScheme.getValue();
            if (candidates.size() <= 1) {
                continue;
            }
            List<String> classNames = FluentIterable.from(candidates)
                    .transform(toClassName)
                    .toSortedList(Ordering.<String>natural());
            String conflictingFileSystems = Joiner.on(", ").join(classNames);
            throw new IllegalStateException(String.format("Scheme: [%s] has conflicting filesystems: [%s]",
                    perScheme.getKey(), conflictingFileSystems));
        }

        // Every scheme now has exactly one file system, so the entries map one-to-one.
        ImmutableMap.Builder<String, FileSystem> result = ImmutableMap.builder();
        for (Entry<String, FileSystem> registration : bySchemes.entries()) {
            result.put(registration.getKey(), registration.getValue());
        }
        return result.build();
    }

    /**
     * Returns a new {@link ResourceId} that represents the named resource of a type corresponding
     * to the resource type.
     *
     * <p>The supplied {@code singleResourceSpec} is expected to be in a proper format, including
     * any necessary escaping, for the underlying {@link FileSystem}.
     *
     * <p>This function may throw an {@link IllegalArgumentException} if given an invalid argument,
     * such as when the specified {@code singleResourceSpec} is not a valid resource name.
     */
    public static ResourceId matchNewResource(String singleResourceSpec, boolean isDirectory) {
        // The scheme parsed from the spec decides which file system interprets it.
        String scheme = parseScheme(singleResourceSpec);
        FileSystem fileSystem = getFileSystemInternal(scheme);
        return fileSystem.matchNewResource(singleResourceSpec, isDirectory);
    }
}