org.apache.streams.s3.S3PersistReader.java Source code

Introduction

Here is the source code for org.apache.streams.s3.S3PersistReader.java.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.streams.s3;

import org.apache.streams.converter.LineReadWriteUtil;
import org.apache.streams.core.DatumStatusCountable;
import org.apache.streams.core.DatumStatusCounter;
import org.apache.streams.core.StreamsDatum;
import org.apache.streams.core.StreamsPersistReader;
import org.apache.streams.core.StreamsResultSet;

import com.amazonaws.ClientConfiguration;
import com.amazonaws.Protocol;
import com.amazonaws.auth.AWSCredentials;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.regions.Region;
import com.amazonaws.regions.Regions;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.S3ClientOptions;
import com.amazonaws.services.s3.model.ListObjectsRequest;
import com.amazonaws.services.s3.model.ObjectListing;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Queues;
import org.apache.commons.lang3.StringUtils;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * S3PersistReader reads documents from Amazon S3.
 */
public class S3PersistReader implements StreamsPersistReader, DatumStatusCountable {

    private static final Logger LOGGER = LoggerFactory.getLogger(S3PersistReader.class);
    public static final String STREAMS_ID = "S3PersistReader";
    protected static final char DELIMITER = '\t';

    private S3ReaderConfiguration s3ReaderConfiguration;
    private AmazonS3Client amazonS3Client;
    private ObjectMapper mapper = new ObjectMapper();
    protected LineReadWriteUtil lineReaderUtil;
    private Collection<String> files;
    private ExecutorService executor;
    protected volatile Queue<StreamsDatum> persistQueue;

    protected DatumStatusCounter countersTotal = new DatumStatusCounter();
    protected DatumStatusCounter countersCurrent = new DatumStatusCounter();
    private Future<?> task;

    public AmazonS3Client getAmazonS3Client() {
        return this.amazonS3Client;
    }

    public S3ReaderConfiguration getS3ReaderConfiguration() {
        return this.s3ReaderConfiguration;
    }

    public String getBucketName() {
        return this.s3ReaderConfiguration.getBucket();
    }

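    // readNew and readRange are not implemented by this reader and simply return null;
    // use readAll() or readCurrent() instead.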
    public StreamsResultSet readNew(BigInteger sequence) {
        return null;
    }

    public StreamsResultSet readRange(DateTime start, DateTime end) {
        return null;
    }

    @Override
    public boolean isRunning() {
        return !task.isDone() && !task.isCancelled();
    }

    public DatumStatusCounter getDatumStatusCounter() {
        return countersTotal;
    }

    public Collection<String> getFiles() {
        return this.files;
    }

    public S3PersistReader(S3ReaderConfiguration s3ReaderConfiguration) {
        this.s3ReaderConfiguration = s3ReaderConfiguration;
    }

    @Override
    public String getId() {
        return STREAMS_ID;
    }

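    /**
     * Builds the Amazon S3 client from the reader configuration and resolves the set of
     * keys (or the single object key) under the configured reader path.
     */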
    @Override
    public void prepare(Object configurationObject) {

        lineReaderUtil = LineReadWriteUtil.getInstance(s3ReaderConfiguration);
        // Connect to S3
        synchronized (this) {
            // Create the credentials Object
            AWSCredentials credentials = new BasicAWSCredentials(s3ReaderConfiguration.getKey(),
                    s3ReaderConfiguration.getSecretKey());

            ClientConfiguration clientConfig = new ClientConfiguration();
            clientConfig.setProtocol(Protocol.valueOf(s3ReaderConfiguration.getProtocol().toString()));

            // We do not want path style access
            S3ClientOptions clientOptions = new S3ClientOptions();
            clientOptions.setPathStyleAccess(false);

            this.amazonS3Client = new AmazonS3Client(credentials, clientConfig);
            if (StringUtils.isNotEmpty(s3ReaderConfiguration.getRegion())) {
                this.amazonS3Client
                        .setRegion(Region.getRegion(Regions.fromName(s3ReaderConfiguration.getRegion())));
            }
            this.amazonS3Client.setS3ClientOptions(clientOptions);
        }

        final ListObjectsRequest request = new ListObjectsRequest()
                .withBucketName(this.s3ReaderConfiguration.getBucket())
                .withPrefix(s3ReaderConfiguration.getReaderPath()).withMaxKeys(500);

        ObjectListing listing = this.amazonS3Client.listObjects(request);

        this.files = new ArrayList<>();

        /*
         * If objects can be listed under this path, we are dealing with a directory;
         * if not, we are most likely dealing with a single file.
         */
        boolean hasCommonPrefixes = !listing.getCommonPrefixes().isEmpty();
        boolean hasObjectSummaries = !listing.getObjectSummaries().isEmpty();

        if (hasCommonPrefixes || hasObjectSummaries) {
            // Handle the 'directory' use case: page through the listing so that every
            // batch, including the final non-truncated one, is added to the file list.
            while (true) {
                if (hasCommonPrefixes) {
                    for (String file : listing.getCommonPrefixes()) {
                        this.files.add(file);
                    }
                } else {
                    for (final S3ObjectSummary objectSummary : listing.getObjectSummaries()) {
                        this.files.add(objectSummary.getKey());
                    }
                }

                if (!listing.isTruncated()) {
                    break;
                }

                // get the next batch of keys.
                listing = this.amazonS3Client.listNextBatchOfObjects(listing);
            }
        } else {
            // handle the single file use-case
            this.files.add(s3ReaderConfiguration.getReaderPath());
        }

        if (this.files.isEmpty()) {
            LOGGER.error("There are no files to read");
        }

        this.persistQueue = Queues.synchronizedQueue(new LinkedBlockingQueue<StreamsDatum>(10000));
        this.executor = Executors.newSingleThreadExecutor();
    }

    public void cleanUp() {
        // no-op
    }

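    /**
     * Starts the background read task and returns a result set backed by the live persist queue.
     */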
    public StreamsResultSet readAll() {
        startStream();
        return new StreamsResultSet(persistQueue);
    }

    public void startStream() {
        LOGGER.debug("startStream");
        task = executor.submit(new S3PersistReaderTask(this));
    }

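    /**
     * Snapshots and drains the in-memory queue, returning the datums read since the previous
     * call and rolling the per-call counters into the running totals.
     */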
    @Override
    public StreamsResultSet readCurrent() {

        StreamsResultSet current;

        synchronized (S3PersistReader.class) {
            current = new StreamsResultSet(new ConcurrentLinkedQueue<>(persistQueue));
            current.setCounter(new DatumStatusCounter());
            current.getCounter().add(countersCurrent);
            countersTotal.add(countersCurrent);
            countersCurrent = new DatumStatusCounter();
            persistQueue.clear();
        }
        return current;
    }

}
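
Usage example

For context, here is a minimal sketch of how this reader might be driven on its own, outside of a full streams pipeline. The S3ReaderConfiguration/S3Configuration setters (setKey, setSecretKey, setBucket, setReaderPath, setProtocol), the S3Configuration.Protocol.HTTPS constant, and StreamsResultSet.getQueue() are assumed here from the generated configuration POJOs and the streams core API; the credential and path values are placeholders.

import org.apache.streams.core.StreamsDatum;
import org.apache.streams.core.StreamsResultSet;
import org.apache.streams.s3.S3Configuration;
import org.apache.streams.s3.S3PersistReader;
import org.apache.streams.s3.S3ReaderConfiguration;

public class S3PersistReaderExample {

    public static void main(String[] args) throws InterruptedException {

        // Assumed setters on the generated configuration POJO; the values are placeholders.
        S3ReaderConfiguration config = new S3ReaderConfiguration();
        config.setKey("YOUR_ACCESS_KEY");
        config.setSecretKey("YOUR_SECRET_KEY");
        config.setBucket("your-bucket");
        config.setReaderPath("path/to/read/");
        config.setProtocol(S3Configuration.Protocol.HTTPS);  // assumed enum constant

        S3PersistReader reader = new S3PersistReader(config);
        reader.prepare(null);                       // builds the client and lists the keys
        StreamsResultSet result = reader.readAll(); // starts the background read task

        // Drain the queue while the background task is still producing datums.
        while (reader.isRunning() || !result.getQueue().isEmpty()) {
            StreamsDatum datum = result.getQueue().poll();
            if (datum != null) {
                System.out.println(datum.getDocument());
            } else {
                Thread.sleep(100);
            }
        }

        reader.cleanUp();
    }
}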