Java tutorial: Kite SDK's FileSystemMetadataProvider
/**
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.kitesdk.data.spi.filesystem;

import com.google.common.annotations.VisibleForTesting;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetExistsException;
import org.kitesdk.data.DatasetException;
import org.kitesdk.data.DatasetIOException;
import org.kitesdk.data.DatasetNotFoundException;
import org.kitesdk.data.impl.Accessor;
import org.kitesdk.data.spi.AbstractDatasetRepository;
import org.kitesdk.data.spi.AbstractMetadataProvider;
import org.kitesdk.data.spi.MetadataProvider;
import com.google.common.base.Charsets;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.io.Closeables;
import java.io.FileNotFoundException;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.kitesdk.data.spi.Compatibility;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;

/**
 * <p>
 * A {@link MetadataProvider} that stores dataset metadata in a Hadoop
 * {@link FileSystem}.
 * </p>
 * <p>
 * When configured with a root directory, this implementation serializes the
 * information within a {@link org.kitesdk.data.DatasetDescriptor} on the
 * provided {@link FileSystem}. The descriptor is stored as a properties file,
 * alongside the dataset's Avro schema, in a {@code .metadata} directory under
 * the directory named after the dataset. For example, if the dataset name is
 * {@code logs}, the directory {@code rootDirectory/logs/.metadata/} will be
 * created, if it doesn't exist, and the serialized descriptor will be stored
 * in the file {@code descriptor.properties}.
 * </p>
 */
public class FileSystemMetadataProvider extends AbstractMetadataProvider {

  private static final Logger LOG = LoggerFactory
      .getLogger(FileSystemMetadataProvider.class);

  private static final String METADATA_DIRECTORY = ".metadata";
  private static final String SCHEMA_FILE_NAME = "schema.avsc";
  private static final String SCHEMA_DIRECTORY_NAME = "schemas";
  private static final String DESCRIPTOR_FILE_NAME = "descriptor.properties";
  private static final String PARTITION_EXPRESSION_FIELD_NAME = "partitionExpression";
  private static final String VERSION_FIELD_NAME = "version";
  private static final String METADATA_VERSION = "1";
  private static final String FORMAT_FIELD_NAME = "format";
  private static final String LOCATION_FIELD_NAME = "location";
  private static final String COMPRESSION_TYPE_FIELD_NAME = "compressionType";
  private static final String DEFAULT_NAMESPACE = "default";

  private static final Set<String> RESERVED_PROPERTIES = Sets.newHashSet(
      PARTITION_EXPRESSION_FIELD_NAME, VERSION_FIELD_NAME, FORMAT_FIELD_NAME,
      LOCATION_FIELD_NAME, COMPRESSION_TYPE_FIELD_NAME);

  private final Configuration conf;
  private final Path rootDirectory;

  // cache the rootDirectory's FileSystem to avoid multiple lookups
  private transient final FileSystem rootFileSystem;

  public FileSystemMetadataProvider(Configuration conf, Path rootDirectory) {
    Preconditions.checkNotNull(conf, "Configuration cannot be null");
    Preconditions.checkNotNull(rootDirectory, "Root directory cannot be null");
    this.conf = conf;
    try {
      this.rootFileSystem = rootDirectory.getFileSystem(conf);
      this.rootDirectory = rootFileSystem.makeQualified(rootDirectory);
    } catch (IOException ex) {
      throw new DatasetIOException("Cannot get FileSystem for root path", ex);
    }
  }

  @Override
  public DatasetDescriptor load(String namespace, String name) {
    Preconditions.checkNotNull(namespace, "Namespace cannot be null");
    Preconditions.checkNotNull(name, "Dataset name cannot be null");

    LOG.debug("Loading dataset metadata name: {}", name);

    Path metadataPath = find(namespace, name);

    InputStream inputStream = null;
    Properties properties = new Properties();
    DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder();
    Path descriptorPath = new Path(metadataPath, DESCRIPTOR_FILE_NAME);

    boolean threw = true;
    try {
      inputStream = rootFileSystem.open(descriptorPath);
      properties.load(inputStream);
      threw = false;
    } catch (IOException e) {
      throw new DatasetIOException(
          "Unable to load descriptor file:" + descriptorPath +
          " for dataset:" + name, e);
    } finally {
      try {
        Closeables.close(inputStream, threw);
      } catch (IOException e) {
        throw new DatasetIOException("Cannot close", e);
      }
    }

    if (properties.containsKey(FORMAT_FIELD_NAME)) {
      builder.format(Accessor.getDefault().newFormat(
          properties.getProperty(FORMAT_FIELD_NAME)));
    }
    if (properties.containsKey(COMPRESSION_TYPE_FIELD_NAME)) {
      builder.compressionType(
          properties.getProperty(COMPRESSION_TYPE_FIELD_NAME));
    }
    if (properties.containsKey(PARTITION_EXPRESSION_FIELD_NAME)) {
      builder.partitionStrategy(Accessor.getDefault().fromExpression(
          properties.getProperty(PARTITION_EXPRESSION_FIELD_NAME)));
    }

    SchemaManager manager = SchemaManager.load(conf,
        new Path(metadataPath, SCHEMA_DIRECTORY_NAME));

    URI schemaURI;
    if (manager == null) {
      // If there is no schema manager directory we are working with a
      // dataset written with an older version. Therefore we just use a
      // reference to the existing schema.
      Path schemaPath = new Path(metadataPath, SCHEMA_FILE_NAME);
      checkExists(getFileSytem(), schemaPath);
      schemaURI = rootFileSystem.makeQualified(schemaPath).toUri();
    } else {
      schemaURI = manager.getNewestSchemaURI();
    }

    if (schemaURI == null) {
      throw new DatasetException("Malformed dataset metadata at " +
          metadataPath + ". No schemas are present.");
    }

    try {
      builder.schemaUri(schemaURI);
    } catch (IOException e) {
      throw new DatasetIOException(
          "Unable to load schema file:" + schemaURI +
          " for dataset:" + name, e);
    }

    final Path location;
    if (properties.containsKey(LOCATION_FIELD_NAME)) {
      // the location should always be written by this library and validated
      // when the descriptor is first created.
      location = new Path(properties.getProperty(LOCATION_FIELD_NAME));
    } else {
      // backwards-compatibility: older versions didn't write this property
      // but the data and metadata were always co-located.
      location = expectedPathForDataset(namespace, name);
    }
    builder.location(location);

    // custom properties
    for (String property : properties.stringPropertyNames()) {
      if (!RESERVED_PROPERTIES.contains(property)) {
        builder.property(property, properties.getProperty(property));
      }
    }

    return builder.build();
  }

  @Override
  public DatasetDescriptor create(String namespace, String name,
                                  DatasetDescriptor descriptor) {
    Preconditions.checkNotNull(namespace, "Namespace cannot be null");
    Preconditions.checkNotNull(name, "Dataset name cannot be null");
    Preconditions.checkNotNull(descriptor, "Descriptor cannot be null");

    Compatibility.check(namespace, name, descriptor);

    LOG.debug("Saving dataset metadata name:{} descriptor:{}", name,
        descriptor);

    // no need to check backward-compatibility when creating new datasets
    Path metadataLocation = pathForMetadata(namespace, name);

    try {
      if (rootFileSystem.exists(metadataLocation)) {
        throw new DatasetExistsException(
            "Descriptor directory already exists: " + metadataLocation);
      }
      // create the directory so that update can do the rest of the work
      rootFileSystem.mkdirs(metadataLocation);
    } catch (IOException e) {
      throw new DatasetIOException(
          "Unable to create metadata directory: " + metadataLocation +
          " for dataset: " + name, e);
    }

    writeDescriptor(rootFileSystem, metadataLocation, name, descriptor);

    return descriptor;
  }

  @Override
  public DatasetDescriptor update(String namespace, String name,
                                  DatasetDescriptor descriptor) {
    Preconditions.checkNotNull(namespace, "Namespace cannot be null");
    Preconditions.checkNotNull(name, "Dataset name cannot be null");
    Preconditions.checkNotNull(descriptor, "Descriptor cannot be null");

    Compatibility.check(namespace, name, descriptor);

    LOG.debug("Saving dataset metadata name: {} descriptor: {}", name,
        descriptor);

    Path metadataPath = find(namespace, name);
    writeDescriptor(rootFileSystem, metadataPath, name, descriptor);

    return descriptor;
  }

  @Override
  public boolean delete(String namespace, String name) {
    Preconditions.checkNotNull(namespace, "Namespace cannot be null");
    Preconditions.checkNotNull(name, "Dataset name cannot be null");

    LOG.debug("Deleting dataset metadata name: {}", name);

    Path metadataDirectory;
    try {
      metadataDirectory = find(namespace, name);
    } catch (DatasetNotFoundException ex) {
      return false;
    }

    try {
      if (rootFileSystem.exists(metadataDirectory)) {
        if (rootFileSystem.delete(metadataDirectory, true)) {
          return true;
        } else {
          throw new IOException(
              "Failed to delete metadata directory:" + metadataDirectory);
        }
      } else {
        return false;
      }
    } catch (IOException e) {
      throw new DatasetIOException(
          "Unable to find or delete metadata directory:" + metadataDirectory +
          " for dataset:" + name, e);
    }
  }

  @Override
  public boolean exists(String namespace, String name) {
    Preconditions.checkNotNull(namespace, "Namespace cannot be null");
    Preconditions.checkNotNull(name, "Dataset name cannot be null");

    try {
      find(namespace, name);
      return true;
    } catch (DatasetNotFoundException e) {
      return false;
    }
  }

  @SuppressWarnings("deprecation")
  @Override
  public Set<String> namespaces() {
    Set<String> namespaces = Sets.newHashSet();
    try {
      FileStatus[] entries = rootFileSystem.listStatus(
          rootDirectory, PathFilters.notHidden());
      for (FileStatus entry : entries) {
        if (entry.isDir()) {
          // may want to add a check: !RESERVED_NAMES.contains(name)
          if (isNamespace(entry.getPath())) {
            namespaces.add(entry.getPath().getName());
          } else if (isDataset(entry.getPath())) {
            // add the default namespace for datasets with no namespace
            namespaces.add(DEFAULT_NAMESPACE);
          }
        }
      }
    } catch (FileNotFoundException ex) {
      // the repo hasn't created any files yet
      return namespaces;
    } catch (IOException ex) {
      throw new DatasetIOException("Could not list namespaces", ex);
    }
    return namespaces;
  }

  @SuppressWarnings("deprecation")
  @Override
  public Set<String> datasets(String namespace) {
    Preconditions.checkNotNull(namespace, "Namespace cannot be null");

    Set<String> datasets = Sets.newHashSet();
    try {
      // if using the default namespace, add datasets with no namespace dir
      if (DEFAULT_NAMESPACE.equals(namespace)) {
        FileStatus[] directEntries = rootFileSystem.listStatus(
            rootDirectory, PathFilters.notHidden());
        for (FileStatus entry : directEntries) {
          if (entry.isDir() && isDataset(entry.getPath())) {
            // may want to add a check: !RESERVED_NAMES.contains(name)
            datasets.add(entry.getPath().getName());
          }
        }
      }
    } catch (FileNotFoundException e) {
      // if the root directory doesn't exist, then no namespace directories do
      return datasets;
    } catch (IOException ex) {
      throw new DatasetIOException("Could not list datasets", ex);
    }

    try {
      FileStatus[] entries = rootFileSystem.listStatus(
          new Path(rootDirectory, namespace), PathFilters.notHidden());
      for (FileStatus entry : entries) {
        if (entry.isDir() && isDataset(entry.getPath())) {
          // may want to add a check: !RESERVED_NAMES.contains(name)
          datasets.add(entry.getPath().getName());
        }
      }
    } catch (FileNotFoundException ex) {
      // the repo hasn't created any files yet
      return datasets;
    } catch (IOException ex) {
      throw new DatasetIOException("Could not list datasets", ex);
    }

    return datasets;
  }

  /**
   * Returns whether the given {@code Path} contains directories with
   * {@code Dataset} metadata.
   *
   * @param dir a Path to check
   * @return {@code true} if there is a direct sub-directory with metadata
   * @throws IOException
   */
  @SuppressWarnings("deprecation")
  private boolean isNamespace(Path dir) throws IOException {
    FileStatus[] stats = rootFileSystem.listStatus(
        dir, PathFilters.notHidden());
    for (FileStatus stat : stats) {
      if (stat.isDir() && isDataset(stat.getPath())) {
        return true;
      }
    }
    return false;
  }

  /**
   * Returns whether the given {@code Path} contains {@code Dataset} metadata.
   *
   * @param dir a Path to check
   * @return {@code true} if there is a .metadata subdirectory
   * @throws IOException
   */
  private boolean isDataset(Path dir) throws IOException {
    return rootFileSystem.isDirectory(new Path(dir, METADATA_DIRECTORY));
  }

  /**
   * Returns the root directory where metadata is stored.
   *
   * @return a Path where {@link DatasetDescriptor}s are stored
   *
   * @since 0.8.0
   */
  Path getRootDirectory() {
    return rootDirectory;
  }

  /**
   * Returns the file system where metadata is stored.
   *
   * @return a FileSystem
   *
   * @since 0.8.0
   */
  FileSystem getFileSytem() {
    return rootFileSystem;
  }

  @Override
  public String toString() {
    return Objects.toStringHelper(this)
        .add("rootDirectory", rootDirectory)
        .add("conf", conf)
        .toString();
  }

  private Path expectedPathForDataset(String namespace, String name) {
    return rootFileSystem.makeQualified(
        FileSystemDatasetRepository.pathForDataset(
            rootDirectory, namespace, name));
  }

  /**
   * Returns the path where this MetadataProvider will store metadata.
   *
   * Note that this is not dependent on the actual storage location for the
   * dataset, although they are usually co-located. This provider must be able
   * to read metadata without a location for the Dataset when loading.
   *
   * @param namespace The namespace containing the {@link Dataset}
   * @param name The {@link Dataset} name
   * @return The directory {@link Path} where metadata files will be located
   */
  private Path pathForMetadata(String namespace, String name) {
    return pathForMetadata(rootDirectory, namespace, name);
  }

  /**
   * Writes the contents of a {@code Descriptor} to files.
   *
   * @param fs The {@link FileSystem} where data will be stored
   * @param metadataLocation The directory {@link Path} where metadata files
   *                         will be located
   * @param name The {@link Dataset} name
   * @param descriptor The {@code Descriptor} contents to write
   *
   * @throws org.kitesdk.data.DatasetIOException
   *           If the {@code metadataLocation} does not exist or if any
   *           IOExceptions need to be propagated.
   */
  @VisibleForTesting
  static void writeDescriptor(FileSystem fs, Path metadataLocation,
                              String name, DatasetDescriptor descriptor) {
    checkExists(fs, metadataLocation);

    // write the schema to the previous file location so it can be read by
    // earlier versions of Kite
    FSDataOutputStream outputStream = null;
    final Path schemaPath = new Path(metadataLocation, SCHEMA_FILE_NAME);
    boolean threw = true;
    try {
      outputStream = fs.create(schemaPath, true /* overwrite */);
      outputStream.write(descriptor.getSchema().toString(true)
          .getBytes(Charsets.UTF_8));
      outputStream.flush();
      threw = false;
    } catch (IOException e) {
      throw new DatasetIOException(
          "Unable to save schema file: " + schemaPath +
          " for dataset: " + name, e);
    } finally {
      try {
        Closeables.close(outputStream, threw);
      } catch (IOException e) {
        throw new DatasetIOException("Cannot close", e);
      }
    }

    // use the SchemaManager for schema operations moving forward
    SchemaManager manager = SchemaManager.create(fs.getConf(),
        new Path(metadataLocation, SCHEMA_DIRECTORY_NAME));
    manager.writeSchema(descriptor.getSchema());

    Properties properties = new Properties();
    properties.setProperty(VERSION_FIELD_NAME, METADATA_VERSION);
    properties.setProperty(FORMAT_FIELD_NAME,
        descriptor.getFormat().getName());
    properties.setProperty(COMPRESSION_TYPE_FIELD_NAME,
        descriptor.getCompressionType().getName());

    final URI dataLocation = descriptor.getLocation();
    if (dataLocation != null) {
      properties.setProperty(LOCATION_FIELD_NAME, dataLocation.toString());
    }

    if (descriptor.isPartitioned()) {
      properties.setProperty(PARTITION_EXPRESSION_FIELD_NAME,
          Accessor.getDefault().toExpression(
              descriptor.getPartitionStrategy()));
    }

    // copy custom properties to the table
    for (String property : descriptor.listProperties()) {
      // no need to check the reserved list, those are not set on descriptors
      properties.setProperty(property, descriptor.getProperty(property));
    }

    final Path descriptorPath = new Path(metadataLocation,
        DESCRIPTOR_FILE_NAME);
    threw = true;
    try {
      outputStream = fs.create(descriptorPath, true /* overwrite */);
      properties.store(outputStream, "Dataset descriptor for " + name);
      outputStream.flush();
      threw = false;
    } catch (IOException e) {
      throw new DatasetIOException(
          "Unable to save descriptor file: " + descriptorPath +
          " for dataset: " + name, e);
    } finally {
      try {
        Closeables.close(outputStream, threw);
      } catch (IOException e) {
        throw new DatasetIOException("Cannot close", e);
      }
    }
  }

  /**
   * Returns the correct metadata path for the given dataset.
   *
   * @param root A Path
   * @param namespace A String namespace
   * @param name A String dataset name
   * @return the metadata Path
   */
  private static Path pathForMetadata(Path root, String namespace,
                                      String name) {
    return new Path(
        FileSystemDatasetRepository.pathForDataset(root, namespace, name),
        METADATA_DIRECTORY);
  }

  /**
   * Precondition-style static validation that a dataset exists.
   *
   * @param fs A FileSystem where the metadata should be stored
   * @param location The Path where the metadata should be stored
   * @throws org.kitesdk.data.DatasetNotFoundException
   *           if the descriptor location is missing
   * @throws org.kitesdk.data.DatasetIOException
   *           if any IOException is thrown
   */
  private static void checkExists(FileSystem fs, Path location) {
    try {
      if (!fs.exists(location)) {
        throw new DatasetNotFoundException(
            "Descriptor location does not exist: " + location);
      }
    } catch (IOException ex) {
      throw new DatasetIOException(
          "Cannot access descriptor location: " + location, ex);
    }
  }

  /**
   * This method provides backward-compatibility for finding metadata.
   * <p>
   * This handles the case where an existing program is opening a
   * DatasetRepository by URI. For example, the DatasetSink and maven plugin
   * do this. In that case, the repository URI will directly contain a
   * directory named for the dataset with .metadata in it. This checks for
   * the updated scheme and falls back to the old scheme if the namespace is
   * "default".
   *
   * @param namespace the requested namespace.
   * @param name the dataset name.
   * @return a Path to the correct metadata directory
   * @throws DatasetNotFoundException if neither location has metadata
   */
  private Path find(String namespace, String name) {
    Path expectedPath = pathForMetadata(namespace, name);
    if (DEFAULT_NAMESPACE.equals(namespace)) {
      // when using the default namespace, the namespace may not be in the
      // path
      try {
        checkExists(rootFileSystem, expectedPath);
        return expectedPath;
      } catch (DatasetNotFoundException e) {
        try {
          Path backwardCompatiblePath = new Path(rootDirectory, new Path(
              name.replace('.', Path.SEPARATOR_CHAR), METADATA_DIRECTORY));
          checkExists(rootFileSystem, backwardCompatiblePath);
          return backwardCompatiblePath;
        } catch (DatasetNotFoundException ignored) {
          throw e; // throw the original
        }
      }
    } else {
      // no need to check other locations
      checkExists(rootFileSystem, expectedPath);
      return expectedPath;
    }
  }
}
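To see the provider in action, here is a minimal usage sketch. It is illustrative only: the root path /tmp/kite-repo, the Log record schema, and the MetadataProviderExample class name are placeholders, and it assumes the schemaLiteral(String) method on DatasetDescriptor.Builder from the Kite SDK. Per writeDescriptor() above, create() lays out schema.avsc, a versioned schemas/ directory, and descriptor.properties under rootDirectory/<namespace>/<name>/.metadata/, and load() rebuilds the descriptor from those files.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.spi.filesystem.FileSystemMetadataProvider;

public class MetadataProviderExample {
  public static void main(String[] args) {
    // Placeholder root directory; any Hadoop-accessible path works here.
    Configuration conf = new Configuration();
    FileSystemMetadataProvider provider =
        new FileSystemMetadataProvider(conf, new Path("/tmp/kite-repo"));

    // Describe a dataset with an inline Avro schema (schemaLiteral is
    // assumed to be available on DatasetDescriptor.Builder).
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaLiteral("{\"type\": \"record\", \"name\": \"Log\", \"fields\": ["
            + "{\"name\": \"msg\", \"type\": \"string\"}]}")
        .build();

    // Writes /tmp/kite-repo/default/logs/.metadata/ containing schema.avsc,
    // the schemas/ directory, and descriptor.properties.
    provider.create("default", "logs", descriptor);

    // Reads the stored metadata back into a descriptor.
    DatasetDescriptor loaded = provider.load("default", "logs");
    System.out.println(loaded.getSchema().toString(true));
  }
}

The same provider instance also supports exists("default", "logs"), update() for compatible descriptor changes, and delete(), which removes the .metadata directory recursively, as the listing above shows.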