com.github.lbroudoux.elasticsearch.river.s3.connector.S3Connector.java Source code

Introduction

Here is the source code for com.github.lbroudoux.elasticsearch.river.s3.connector.S3Connector.java
Source

/*
 * Licensed to Laurent Broudoux (the "Author") under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Author licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package com.github.lbroudoux.elasticsearch.river.s3.connector;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import com.amazonaws.services.s3.model.*;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;

import com.amazonaws.auth.AWSCredentials;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3Client;
import com.github.lbroudoux.elasticsearch.river.s3.river.S3RiverFeedDefinition;

/**
 * This is a connector for querying and retrieving files or folders from
 * an Amazon S3 bucket. Credentials are mandatory for connecting to remote drive.
 * @author laurent
 */
public class S3Connector {
    // This will only work if you can check presence in ES easily
    final int MAX_NEW_RESULTS_TO_INDEX_ON_RUN = 10000;

    private static final ESLogger logger = Loggers.getLogger(S3Connector.class);

    private final String accessKey;
    private final String secretKey;
    private String bucketName;
    private String pathPrefix;
    private AmazonS3Client s3Client;

    public S3Connector(String accessKey, String secretKey) {
        this.accessKey = accessKey;
        this.secretKey = secretKey;
    }

    /**
     * Connect to the specified bucket using previously given accesskey and secretkey.
     * @param bucketName Name of the bucket to connect to
     * @param pathPrefix Prefix that will be later used for filtering documents
     * @throws AmazonS3Exception when access or secret keys are wrong or bucket does not exists
     */
    public void connectUserBucket(String bucketName, String pathPrefix) throws AmazonS3Exception {
        this.bucketName = bucketName;
        this.pathPrefix = pathPrefix;
        AWSCredentials credentials = new BasicAWSCredentials(accessKey, secretKey);
        s3Client = new AmazonS3Client(credentials);
        // Getting location seems odd as we don't use it later and doesBucketExists() seems
        // more appropriate... However, this later returns true even for non existing buckets !
        s3Client.getBucketLocation(bucketName);
    }

    /**
     * Select and retrieves summaries of object into bucket and of given path prefix
     * that have modification date younger than lastScanTime.
     * @param lastScanTime Last modification date filter
     * @return Summaries of picked objects.
     */
    public S3ObjectSummaries getObjectSummaries(String riverName, Long lastScanTime, String initialScanBookmark,
            boolean trackS3Deletions) {
        List<String> keys = new ArrayList<String>();
        List<S3ObjectSummary> result = new ArrayList<S3ObjectSummary>();
        boolean initialScan = initialScanBookmark != null;

        if (initialScan) {
            trackS3Deletions = false;
            logger.info("{}: resuming initial scan of {} from {}", riverName, pathPrefix, initialScanBookmark);
        } else {
            logger.info("{}: checking {} for changes since {}", riverName, pathPrefix, lastScanTime);
        }

        // Store the scan time to return before doing big queries...
        Long lastScanTimeToReturn = System.currentTimeMillis();

        if (lastScanTime == null || initialScan) {
            lastScanTime = 0L;
        }

        ListObjectsRequest request = new ListObjectsRequest().withBucketName(bucketName).withPrefix(pathPrefix)
                .withEncodingType("url");
        ObjectListing listing = s3Client.listObjects(request);
        //logger.debug("Listing: {}", listing);
        int keyCount = 0;
        boolean scanTruncated = false;
        String lastKey = null;

        while (!listing.getObjectSummaries().isEmpty() || listing.isTruncated()) {
            List<S3ObjectSummary> summaries = listing.getObjectSummaries();
            // if (logger.isDebugEnabled()) {
            //    logger.debug("Found {} items in this listObjects page", summaries.size());
            // }

            for (S3ObjectSummary summary : summaries) {
                if (logger.isDebugEnabled()) {
                    // logger.debug("Getting {} last modified on {}", summary.getKey(), summary.getLastModified());
                }

                if (trackS3Deletions) {
                    keys.add(summary.getKey());
                }

                if (summary.getLastModified().getTime() > lastScanTime
                        && result.size() < MAX_NEW_RESULTS_TO_INDEX_ON_RUN) {
                    // logger.debug("  Picked !");

                    if (!initialScan || initialScanBookmark.compareTo(summary.getKey()) < 0) {
                        logger.debug("  Picked {}", summary.getKey());
                        result.add(summary);
                        lastKey = summary.getKey();
                    }

                } else if (!scanTruncated && result.size() == MAX_NEW_RESULTS_TO_INDEX_ON_RUN) {
                    logger.info("{}: only indexing up to {} new objects on this indexing run", riverName,
                            MAX_NEW_RESULTS_TO_INDEX_ON_RUN);
                    // initialScan = true;
                    scanTruncated = true;

                    if (!trackS3Deletions) {
                        // No need to keep iterating through all keys if we aren't doing deleteOnS3 
                        break;
                    }
                }

                keyCount += 1;
            }

            if (initialScan && scanTruncated && !trackS3Deletions) {
                break;
            }

            listing = s3Client.listNextBatchOfObjects(listing);
        }

        // Wrap results and latest scan time.
        if (scanTruncated) {
            logger.info("{}: scan truncated for speed: {} files ({} new)", riverName, keyCount, result.size());
        } else {
            logger.info("{}: complete scan: {} files ({} new)", riverName, keyCount, result.size());
        }

        return new S3ObjectSummaries(lastScanTimeToReturn, lastKey, scanTruncated, trackS3Deletions, result, keys);
    }

    public Map<String, Object> getS3UserMetadata(String key) {
        return Collections
                .<String, Object>unmodifiableMap(s3Client.getObjectMetadata(bucketName, key).getUserMetadata());
    }

    public String getDecodedKey(S3ObjectSummary summary) {
        //return summary.getKey();  // If you deactivate using withEncodingType above
        try {
            return java.net.URLDecoder.decode(summary.getKey(), "UTF-8");
        } catch (java.io.UnsupportedEncodingException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Download Amazon S3 file as byte array.
     * @param summary The summary of the S3 Object to download
     * @return This file bytes or null if something goes wrong.
     */
    public byte[] getContent(S3ObjectSummary summary) {
        String key = getDecodedKey(summary);

        // Retrieve object corresponding to key into bucket.
        if (logger.isDebugEnabled()) {
            logger.debug("Downloading file content from {}", key);
        }

        S3Object object = s3Client.getObject(bucketName, key);

        InputStream is = null;
        ByteArrayOutputStream bos = null;

        try {
            // Get input stream on S3 Object.
            is = object.getObjectContent();
            bos = new ByteArrayOutputStream();

            byte[] buffer = new byte[4096];
            int len = is.read(buffer);
            while (len > 0) {
                bos.write(buffer, 0, len);
                len = is.read(buffer);
            }

            // Flush and return result.
            bos.flush();
            return bos.toByteArray();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            if (bos != null) {
                try {
                    bos.close();
                } catch (IOException e) {
                }
            }
            try {
                is.close();
            } catch (IOException e) {
            }
        }
    }

    /**
     * Get the download url of this S3 object. May return null if the
     * object bucket and key cannot be converted to a URL.
     * @param summary A S3 object
     * @param feedDefinition The holder of S3 feed definition.
     * @return The resource url if possible (access is subject to AWS credential)
     */
    public String getDownloadUrl(S3ObjectSummary summary, S3RiverFeedDefinition feedDefinition) {
        String resourceUrl = s3Client.getResourceUrl(summary.getBucketName(), summary.getKey());
        // If a download host (actually a vhost such as cloudfront offers) is specified, use it to
        // recreate a vhosted resource url. This is made by substitution of the generic host name in url. 
        if (resourceUrl != null && feedDefinition.getDownloadHost() != null) {
            int hostPosEnd = resourceUrl.indexOf("s3.amazonaws.com/") + "s3.amazonaws.com".length();
            String vhostResourceUrl = feedDefinition.getDownloadHost() + resourceUrl.substring(hostPosEnd);
            return vhostResourceUrl;
        }
        return resourceUrl;
    }
}