org.apache.druid.firehose.s3.StaticS3FirehoseFactory.java — source code

Java example

Introduction

Below is the full source code for org.apache.druid.firehose.s3.StaticS3FirehoseFactory.java.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.firehose.s3;

import com.amazonaws.services.s3.model.AmazonS3Exception;
import com.amazonaws.services.s3.model.GetObjectRequest;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.S3Object;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.Lists;
import org.apache.druid.data.input.FiniteFirehoseFactory;
import org.apache.druid.data.input.InputSplit;
import org.apache.druid.data.input.impl.StringInputRowParser;
import org.apache.druid.data.input.impl.prefetch.PrefetchableTextFilesFirehoseFactory;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.java.util.common.IOE;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.storage.s3.S3Utils;
import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3;
import org.apache.druid.utils.CompressionUtils;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

/**
 * Builds firehoses that read from a predefined list of S3 objects and then dry up.
 */
/**
 * Builds firehoses that read from a predefined list of S3 objects and then dry up.
 *
 * <p>Input is configured with exactly one of {@code uris} (explicit object list) or
 * {@code prefixes} (roots to list under); every URI must use the {@code s3} scheme.
 */
public class StaticS3FirehoseFactory extends PrefetchableTextFilesFirehoseFactory<URI> {
    private static final Logger log = new Logger(StaticS3FirehoseFactory.class);

    // Page size used when listing objects under a prefix.
    private static final int MAX_LISTING_LENGTH = 1024;

    private final ServerSideEncryptingAmazonS3 s3Client;
    private final List<URI> uris;
    private final List<URI> prefixes;

    @JsonCreator
    public StaticS3FirehoseFactory(@JacksonInject("s3Client") ServerSideEncryptingAmazonS3 s3Client,
            @JsonProperty("uris") List<URI> uris, @JsonProperty("prefixes") List<URI> prefixes,
            @JsonProperty("maxCacheCapacityBytes") Long maxCacheCapacityBytes,
            @JsonProperty("maxFetchCapacityBytes") Long maxFetchCapacityBytes,
            @JsonProperty("prefetchTriggerBytes") Long prefetchTriggerBytes,
            @JsonProperty("fetchTimeout") Long fetchTimeout, @JsonProperty("maxFetchRetry") Integer maxFetchRetry) {
        super(maxCacheCapacityBytes, maxFetchCapacityBytes, prefetchTriggerBytes, fetchTimeout, maxFetchRetry);
        this.s3Client = Preconditions.checkNotNull(s3Client, "s3Client");
        this.uris = (uris != null) ? uris : new ArrayList<>();
        this.prefixes = (prefixes != null) ? prefixes : new ArrayList<>();

        // The two input modes are mutually exclusive, and at least one must be present.
        if (!this.uris.isEmpty() && !this.prefixes.isEmpty()) {
            throw new IAE("uris and prefixes cannot be used together");
        }
        if (this.uris.isEmpty() && this.prefixes.isEmpty()) {
            throw new IAE("uris or prefixes must be specified");
        }

        requireS3Scheme(this.uris);
        requireS3Scheme(this.prefixes);
    }

    /** Rejects any URI whose scheme is not {@code s3}. */
    private static void requireS3Scheme(Iterable<URI> candidates) {
        for (final URI inputURI : candidates) {
            Preconditions.checkArgument("s3".equals(inputURI.getScheme()), "input uri scheme == s3 (%s)", inputURI);
        }
    }

    @JsonProperty
    public List<URI> getUris() {
        return uris;
    }

    @JsonProperty("prefixes")
    public List<URI> getPrefixes() {
        return prefixes;
    }

    @Override
    protected Collection<URI> initObjects() throws IOException {
        // The returned s3 objects carry minimal metadata only; actual data is fetched
        // lazily when openObjectStream() is invoked per object.
        if (!uris.isEmpty()) {
            return uris;
        }

        final List<S3ObjectSummary> summaries = new ArrayList<>();
        for (URI prefixUri : prefixes) {
            collectSummariesUnderPrefix(prefixUri, summaries);
        }

        final List<URI> result = new ArrayList<>(summaries.size());
        for (S3ObjectSummary summary : summaries) {
            result.add(toUri(summary));
        }
        return result;
    }

    /**
     * Lists all objects under {@code uri} into {@code out}. If listing is forbidden (403),
     * falls back to treating the uri as a single object path fetched via GetObject metadata.
     */
    private void collectSummariesUnderPrefix(URI uri, List<S3ObjectSummary> out) throws IOException {
        final String bucket = uri.getAuthority();
        final String key = S3Utils.extractS3Key(uri);

        try {
            final Iterator<S3ObjectSummary> summaryIterator = S3Utils.objectSummaryIterator(s3Client,
                    bucket, key, MAX_LISTING_LENGTH);
            while (summaryIterator.hasNext()) {
                out.add(summaryIterator.next());
            }
        } catch (AmazonS3Exception listException) {
            log.error(listException, "Exception while listing on %s", uri);

            if (listException.getStatusCode() != 403) {
                throw new IOException(listException);
            }

            // "Access Denied" can mean the user lacks List permission on this uri. That is often
            // benign: the uri may actually be a full object path rather than a prefix, in which
            // case a direct GetObject (with its own permission) still succeeds.
            log.warn("Access denied for %s. Try to get the object from the uri without listing", uri);
            try {
                final ObjectMetadata metadata = s3Client.getObjectMetadata(bucket, key);

                if (S3Utils.isDirectoryPlaceholder(key, metadata)) {
                    throw new IOE("[%s] is a directory placeholder, "
                            + "but failed to get the object list under the directory due to permission",
                            uri);
                }
                out.add(S3Utils.getSingleObjectSummary(s3Client, bucket, key));
            } catch (AmazonS3Exception getException) {
                throw new IOException(getException);
            }
        }
    }

    @Override
    protected InputStream openObjectStream(URI object) throws IOException {
        try {
            // Fetch the object's data and expose it as a stream.
            final String bucket = object.getAuthority();
            final String key = S3Utils.extractS3Key(object);

            final S3Object fetched = s3Client.getObject(bucket, key);
            if (fetched == null) {
                throw new ISE("Failed to get an s3 object for bucket[%s] and key[%s]", bucket, key);
            }
            return fetched.getObjectContent();
        } catch (AmazonS3Exception e) {
            throw new IOException(e);
        }
    }

    @Override
    protected InputStream openObjectStream(URI object, long start) throws IOException {
        final String bucket = object.getAuthority();
        final String key = S3Utils.extractS3Key(object);

        // Resume from the given byte offset via a ranged GET.
        final GetObjectRequest rangedRequest = new GetObjectRequest(bucket, key);
        rangedRequest.setRange(start);
        try {
            final S3Object fetched = s3Client.getObject(rangedRequest);
            if (fetched == null) {
                throw new ISE("Failed to get an s3 object for bucket[%s], key[%s], and start[%d]", bucket, key,
                        start);
            }
            return fetched.getObjectContent();
        } catch (AmazonS3Exception e) {
            throw new IOException(e);
        }
    }

    @Override
    protected InputStream wrapObjectStream(URI object, InputStream stream) throws IOException {
        // Decompress transparently based on the object key's file extension.
        return CompressionUtils.decompress(stream, S3Utils.extractS3Key(object));
    }

    @Override
    public boolean equals(Object o) {
        if (o == this) {
            return true;
        }
        if (o == null || o.getClass() != getClass()) {
            return false;
        }

        final StaticS3FirehoseFactory other = (StaticS3FirehoseFactory) o;

        return Objects.equals(uris, other.uris)
                && Objects.equals(prefixes, other.prefixes)
                && getMaxCacheCapacityBytes() == other.getMaxCacheCapacityBytes()
                && getMaxFetchCapacityBytes() == other.getMaxFetchCapacityBytes()
                && getPrefetchTriggerBytes() == other.getPrefetchTriggerBytes()
                && getFetchTimeout() == other.getFetchTimeout()
                && getMaxFetchRetry() == other.getMaxFetchRetry();
    }

    @Override
    public int hashCode() {
        return Objects.hash(uris, prefixes, getMaxCacheCapacityBytes(), getMaxFetchCapacityBytes(),
                getPrefetchTriggerBytes(), getFetchTimeout(), getMaxFetchRetry());
    }

    @Override
    protected Predicate<Throwable> getRetryCondition() {
        return S3Utils.S3RETRY;
    }

    @Override
    public FiniteFirehoseFactory<StringInputRowParser, URI> withSplit(InputSplit<URI> split) {
        // A split factory reads exactly one uri and inherits all prefetch settings.
        final List<URI> singleUri = Collections.singletonList(split.get());
        return new StaticS3FirehoseFactory(s3Client, singleUri, null,
                getMaxCacheCapacityBytes(), getMaxFetchCapacityBytes(), getPrefetchTriggerBytes(),
                getFetchTimeout(), getMaxFetchRetry());
    }

    /**
     * Create an {@link URI} from the given {@link S3ObjectSummary}. The result URI is composed as below.
     *
     * <pre>
     * {@code s3://{BUCKET_NAME}/{OBJECT_KEY}}
     * </pre>
     */
    private static URI toUri(S3ObjectSummary object) {
        String bucket = object.getBucketName();
        if (bucket.endsWith("/")) {
            bucket = bucket.substring(0, bucket.length() - 1);
        }
        String key = object.getKey();
        if (key.startsWith("/")) {
            key = key.substring(1);
        }
        return URI.create(StringUtils.format("s3://%s/%s", bucket, key));
    }
}