// Java tutorial example: a Hadoop FileSystem-backed DatasetRepository implementation.
/**
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.cdk.data.filesystem;

import com.cloudera.cdk.data.Dataset;
import com.cloudera.cdk.data.DatasetDescriptor;
import com.cloudera.cdk.data.DatasetRepositoryException;
import com.cloudera.cdk.data.FieldPartitioner;
import com.cloudera.cdk.data.IncompatibleSchemaException;
import com.cloudera.cdk.data.MetadataProvider;
import com.cloudera.cdk.data.MetadataProviderException;
import com.cloudera.cdk.data.PartitionKey;
import com.cloudera.cdk.data.PartitionStrategy;
import com.cloudera.cdk.data.filesystem.impl.Accessor;
import com.cloudera.cdk.data.spi.AbstractDatasetRepository;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.base.Supplier;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * <p>
 * A {@link com.cloudera.cdk.data.DatasetRepository} that stores data in a Hadoop
 * {@link FileSystem}.
 * </p>
 * <p>
 * Given a {@link FileSystem}, a root directory, and a
 * {@link com.cloudera.cdk.data.MetadataProvider}, this
 * {@link com.cloudera.cdk.data.DatasetRepository} implementation can load and store
 * {@link com.cloudera.cdk.data.Dataset}s on both local filesystems as well as the
 * Hadoop Distributed FileSystem (HDFS). Users may directly instantiate this class
 * with the three dependencies above and then perform dataset-related operations
 * using any of the provided methods. The primary methods of interest will be
 * {@link #create(String, com.cloudera.cdk.data.DatasetDescriptor)},
 * {@link #load(String)}, and {@link #delete(String)} which create a new dataset,
 * load an existing dataset, or delete an existing dataset, respectively. Once a
 * dataset has been created or loaded, users can invoke the appropriate
 * {@link com.cloudera.cdk.data.Dataset} methods to get a reader or writer as
 * needed.
 * </p>
 * <p>
 * {@link com.cloudera.cdk.data.DatasetWriter} instances returned from this
 * implementation have the following <code>flush()</code> method semantics. For
 * Avro files, <code>flush()</code> will invoke HDFS <code>hflush</code>, which
 * guarantees that client buffers are flushed, so new readers will see all entries
 * written up to that point. For Parquet files, <code>flush()</code> has no effect.
 * </p>
 *
 * @see com.cloudera.cdk.data.DatasetRepository
 * @see com.cloudera.cdk.data.Dataset
 * @see com.cloudera.cdk.data.DatasetDescriptor
 * @see com.cloudera.cdk.data.PartitionStrategy
 * @see com.cloudera.cdk.data.MetadataProvider
 */
public class FileSystemDatasetRepository extends AbstractDatasetRepository {

  private static final Logger logger =
      LoggerFactory.getLogger(FileSystemDatasetRepository.class);

  static {
    // Register the filesystem accessor so other CDK modules can reach
    // package-private functionality through the Accessor indirection.
    Accessor.setDefault(new AccessorImpl());
  }

  private final MetadataProvider metadataProvider;
  private final Configuration conf;

  /**
   * Construct a {@link FileSystemDatasetRepository} for the given
   * {@link MetadataProvider} for metadata storage.
   *
   * @param conf a {@link Configuration} for {@link FileSystem} access
   * @param metadataProvider the provider for metadata storage
   *
   * @since 0.8.0
   */
  public FileSystemDatasetRepository(Configuration conf, MetadataProvider metadataProvider) {
    Preconditions.checkArgument(conf != null, "Configuration cannot be null");
    Preconditions.checkArgument(metadataProvider != null,
        "Metadata provider can not be null");
    this.conf = conf;
    this.metadataProvider = metadataProvider;
  }

  @Override
  public <E> Dataset<E> create(String name, DatasetDescriptor descriptor) {
    Preconditions.checkArgument(name != null, "Name can not be null");
    Preconditions.checkArgument(descriptor != null, "Descriptor can not be null");
    Preconditions.checkArgument(descriptor.getLocation() == null,
        "Descriptor location cannot be set; " +
        "it is assigned by the MetadataProvider");

    final DatasetDescriptor newDescriptor = metadataProvider.create(name, descriptor);
    final URI location = newDescriptor.getLocation();
    if (location == null) {
      // The provider contract requires a location to be assigned; a null here
      // indicates a broken MetadataProvider implementation, not user error.
      throw new DatasetRepositoryException(
          "[BUG] MetadataProvider did not assign a location to dataset:" + name);
    }

    // Make sure the data directory exists before handing the dataset back.
    ensureExists(newDescriptor, conf);

    logger.debug("Created dataset:{} schema:{} datasetPath:{}", new Object[] {
        name, newDescriptor.getSchema(), location.toString() });

    return new FileSystemDataset.Builder()
        .name(name)
        .configuration(conf)
        .descriptor(newDescriptor)
        .partitionKey(newDescriptor.isPartitioned() ?
            com.cloudera.cdk.data.impl.Accessor.getDefault().newPartitionKey() :
            null)
        .build();
  }

  @Override
  public <E> Dataset<E> update(String name, DatasetDescriptor descriptor) {
    Preconditions.checkArgument(name != null, "Dataset name cannot be null");
    Preconditions.checkArgument(descriptor != null, "DatasetDescriptor cannot be null");

    DatasetDescriptor oldDescriptor = metadataProvider.load(name);
    // oldDescriptor is valid if load didn't throw NoSuchDatasetException

    if (!oldDescriptor.getFormat().equals(descriptor.getFormat())) {
      throw new DatasetRepositoryException("Cannot change dataset format from " +
          oldDescriptor.getFormat() + " to " + descriptor.getFormat());
    }

    final URI oldLocation = oldDescriptor.getLocation();
    if ((oldLocation != null) && !(oldLocation.equals(descriptor.getLocation()))) {
      throw new DatasetRepositoryException("Cannot change the dataset's location");
    }

    if (oldDescriptor.isPartitioned() != descriptor.isPartitioned()) {
      throw new DatasetRepositoryException("Cannot change an unpartitioned dataset to " +
          " partitioned or vice versa.");
    } else if (oldDescriptor.isPartitioned() && descriptor.isPartitioned() &&
        !oldDescriptor.getPartitionStrategy().equals(descriptor.getPartitionStrategy())) {
      throw new DatasetRepositoryException("Cannot change partition strategy from " +
          oldDescriptor.getPartitionStrategy() + " to " + descriptor.getPartitionStrategy());
    }

    // check can read records written with old schema using new schema
    final Schema oldSchema = oldDescriptor.getSchema();
    final Schema newSchema = descriptor.getSchema();
    if (!SchemaValidationUtil.canRead(oldSchema, newSchema)) {
      throw new IncompatibleSchemaException("New schema cannot read data written using " +
          "old schema. New schema: " + newSchema.toString(true) + "\nOld schema: " +
          oldSchema.toString(true));
    }

    final DatasetDescriptor updatedDescriptor = metadataProvider.update(name, descriptor);

    logger.debug("Updated dataset:{} schema:{} datasetPath:{}", new Object[] {
        name, updatedDescriptor.getSchema(), updatedDescriptor.getLocation().toString() });

    return new FileSystemDataset.Builder()
        .name(name)
        .configuration(conf)
        .descriptor(updatedDescriptor)
        .partitionKey(updatedDescriptor.isPartitioned() ?
            com.cloudera.cdk.data.impl.Accessor.getDefault().newPartitionKey() :
            null)
        .build();
  }

  @Override
  public <E> Dataset<E> load(String name) {
    Preconditions.checkArgument(name != null, "Name can not be null");

    logger.debug("Loading dataset:{}", name);

    DatasetDescriptor descriptor = metadataProvider.load(name);

    FileSystemDataset<E> ds = new FileSystemDataset.Builder()
        .name(name)
        .configuration(conf)
        .descriptor(descriptor)
        .partitionKey(descriptor.isPartitioned() ?
            com.cloudera.cdk.data.impl.Accessor.getDefault().newPartitionKey() :
            null)
        .build();

    logger.debug("Loaded dataset:{}", ds);

    return ds;
  }

  @SuppressWarnings("deprecation")
  @Override
  public boolean delete(String name) {
    Preconditions.checkArgument(name != null, "Name can not be null");

    logger.debug("Deleting dataset:{}", name);

    final DatasetDescriptor descriptor;
    try {
      descriptor = metadataProvider.load(name);
    } catch (com.cloudera.cdk.data.NoSuchDatasetException ex) {
      // nothing to delete
      return false;
    }

    boolean changed;
    try {
      // don't care about the return value here -- if it already doesn't exist
      // we still need to delete the data directory
      changed = metadataProvider.delete(name);
    } catch (MetadataProviderException ex) {
      throw new DatasetRepositoryException(
          "Failed to delete descriptor for name:" + name, ex);
    }

    final Path dataLocation = new Path(descriptor.getLocation());
    final FileSystem fs = fsForPath(dataLocation, conf);

    try {
      if (fs.exists(dataLocation)) {
        if (fs.delete(dataLocation, true)) {
          changed = true;
        } else {
          throw new DatasetRepositoryException(
              "Failed to delete dataset name:" + name + " location:" + dataLocation);
        }
      }
    } catch (IOException e) {
      // preserve the underlying cause so callers can diagnose the FS failure
      throw new DatasetRepositoryException(
          "Internal failure when removing location:" + dataLocation, e);
    }
    return changed;
  }

  @Override
  public boolean exists(String name) {
    Preconditions.checkArgument(name != null, "Name can not be null");
    return metadataProvider.exists(name);
  }

  @Override
  public Collection<String> list() {
    return metadataProvider.list();
  }

  /**
   * Get a {@link com.cloudera.cdk.data.PartitionKey} corresponding to a partition's
   * filesystem path represented as a {@link URI}. If the path is not a valid
   * partition, then {@link IllegalArgumentException} is thrown. Note that the
   * partition does not have to exist.
   *
   * @param dataset the filesystem dataset
   * @param partitionPath a directory path where the partition data is stored
   * @return a partition key representing the partition at the given path
   * @since 0.4.0
   */
  @SuppressWarnings("deprecation")
  public static PartitionKey partitionKeyForPath(Dataset dataset, URI partitionPath) {
    Preconditions.checkState(dataset.getDescriptor().isPartitioned(),
        "Attempt to get a partition on a non-partitioned dataset (name:%s)",
        dataset.getName());
    Preconditions.checkArgument(dataset instanceof FileSystemDataset,
        "Dataset is not a FileSystemDataset");
    FileSystemDataset fsDataset = (FileSystemDataset) dataset;

    FileSystem fs = fsDataset.getFileSystem();
    URI partitionUri = fs.makeQualified(new Path(partitionPath)).toUri();
    URI directoryUri = fsDataset.getDirectory().toUri();

    // relativize() returns its argument unchanged when it is not under the
    // dataset directory, which is how we detect a foreign path
    URI relativizedUri = directoryUri.relativize(partitionUri);
    if (relativizedUri.equals(partitionUri)) {
      throw new IllegalArgumentException(String.format("Partition URI %s has different "
          + "root directory to dataset (directory: %s).", partitionUri, directoryUri));
    }

    Iterable<String> parts = Splitter.on('/').split(relativizedUri.getPath());

    PartitionStrategy partitionStrategy = dataset.getDescriptor().getPartitionStrategy();
    List<FieldPartitioner> fieldPartitioners = partitionStrategy.getFieldPartitioners();
    if (Iterables.size(parts) > fieldPartitioners.size()) {
      throw new IllegalArgumentException(String.format("Too many partition directories "
          + "for %s (%s), expecting %s.", partitionUri, Iterables.size(parts),
          fieldPartitioners.size()));
    }

    // each path segment is expected to look like "<fieldName>=<stringValue>"
    List<Object> values = Lists.newArrayList();
    int i = 0;
    for (String part : parts) {
      Iterator<String> split = Splitter.on('=').split(part).iterator();
      String fieldName = split.next();
      FieldPartitioner fp = fieldPartitioners.get(i++);
      if (!fieldName.equals(fp.getName())) {
        throw new IllegalArgumentException(String.format("Unrecognized partition name "
            + "'%s' in partition %s, expecting '%s'.", fieldName, partitionUri,
            fp.getName()));
      }
      if (!split.hasNext()) {
        throw new IllegalArgumentException(String.format("Missing partition value for "
            + "'%s' in partition %s.", fieldName, partitionUri));
      }
      String stringValue = split.next();
      Object value = fp.valueFromString(stringValue);
      values.add(value);
    }
    return com.cloudera.cdk.data.impl.Accessor.getDefault()
        .newPartitionKey(values.toArray(new Object[values.size()]));
  }

  @Override
  public String toString() {
    return Objects.toStringHelper(this)
        .add("metadataProvider", metadataProvider)
        .toString();
  }

  /**
   * @return the {@link MetadataProvider} being used by this repository.
   * @since 0.2.0
   */
  public MetadataProvider getMetadataProvider() {
    return metadataProvider;
  }

  /**
   * Creates, if necessary, the given the location for {@code descriptor}.
   *
   * @param descriptor A DatasetDescriptor
   * @param conf A Configuration
   */
  static void ensureExists(DatasetDescriptor descriptor, Configuration conf) {
    Preconditions.checkArgument(descriptor.getLocation() != null,
        "Cannot get FileSystem for a descriptor with no location");
    final Path dataPath = new Path(descriptor.getLocation());

    final FileSystem fs = fsForPath(dataPath, conf);

    try {
      if (!fs.exists(dataPath)) {
        fs.mkdirs(dataPath);
      }
    } catch (IOException ex) {
      throw new DatasetRepositoryException("Cannot access data location", ex);
    }
  }

  private static FileSystem fsForPath(Path dataPath, Configuration conf) {
    try {
      return dataPath.getFileSystem(conf);
    } catch (IOException ex) {
      throw new DatasetRepositoryException(
          "Cannot get FileSystem for descriptor", ex);
    }
  }

  /**
   * A fluent builder to aid in the construction of {@link FileSystemDatasetRepository}
   * instances.
   * @since 0.2.0
   */
  public static class Builder implements Supplier<FileSystemDatasetRepository> {

    private Path rootDirectory;
    private FileSystem fileSystem;
    private MetadataProvider metadataProvider;
    private Configuration configuration;

    /**
     * The root directory for metadata and dataset files.
     *
     * @param path a Path to a FileSystem location
     * @return this Builder for method chaining.
     */
    public Builder rootDirectory(Path path) {
      this.rootDirectory = path;
      return this;
    }

    /**
     * The root directory for metadata and dataset files.
     *
     * @param uri a URI to a FileSystem location
     * @return this Builder for method chaining.
     */
    public Builder rootDirectory(URI uri) {
      this.rootDirectory = new Path(uri);
      return this;
    }

    /**
     * The root directory for metadata and dataset files.
     *
     * @param uri a String to parse as a URI
     * @return this Builder for method chaining.
     * @throws URISyntaxException
     *
     * @since 0.8.0
     */
    public Builder rootDirectory(String uri) throws URISyntaxException {
      return rootDirectory(new URI(uri));
    }

    /**
     * The {@link FileSystem} to store metadata and dataset files in (optional).
     *
     * The FileSystem for the root directory is used if this FileSystem is not
     * set.
     */
    public Builder fileSystem(FileSystem fileSystem) {
      this.fileSystem = fileSystem;
      return this;
    }

    /**
     * The {@link MetadataProvider} for metadata storage (optional). If not
     * specified, a {@link FileSystemMetadataProvider} will be used.
     */
    public Builder metadataProvider(MetadataProvider metadataProvider) {
      this.metadataProvider = metadataProvider;
      return this;
    }

    /**
     * The {@link Configuration} used to find the {@link FileSystem} (optional).
     * If not specified, the default configuration will be used.
     * @since 0.3.0
     */
    public Builder configuration(Configuration configuration) {
      this.configuration = configuration;
      return this;
    }

    /**
     * @deprecated will be removed in 0.11.0
     */
    @Override
    @Deprecated
    public FileSystemDatasetRepository get() {
      return build();
    }

    /**
     * Build an instance of the configured {@link FileSystemDatasetRepository}.
     *
     * @since 0.9.0
     */
    public FileSystemDatasetRepository build() {
      if (configuration == null) {
        this.configuration = new Configuration();
      }

      if (metadataProvider == null) {
        Preconditions.checkState(this.rootDirectory != null,
            "No root directory defined");

        // the rootDirectory can have a scheme/authority that overrides
        if (fileSystem != null) {
          // if the FS doesn't match, this will throw IllegalArgumentException
          this.metadataProvider = new FileSystemMetadataProvider.Builder()
              .configuration(configuration)
              .rootDirectory(fileSystem.makeQualified(rootDirectory))
              .build();
        } else {
          this.metadataProvider = new FileSystemMetadataProvider.Builder()
              .configuration(configuration)
              .rootDirectory(rootDirectory)
              .build();
        }
      } else {
        Preconditions.checkState(rootDirectory == null,
            "Root directory is ignored when a MetadataProvider is set");
        Preconditions.checkState(fileSystem == null,
            "File system is ignored when a MetadataProvider is set");
      }

      return new FileSystemDatasetRepository(configuration, metadataProvider);
    }
  }
}