com.amazonaws.services.kinesis.connectors.redshift.RedshiftManifestEmitter.java Source code

Java tutorial

Introduction

Below is the complete source code for the class com.amazonaws.services.kinesis.connectors.redshift.RedshiftManifestEmitter, an Amazon Kinesis connector emitter that loads data into Amazon Redshift using manifest files stored in Amazon S3.

Source

/*
 * Copyright 2013-2014 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Amazon Software License (the "License").
 * You may not use this file except in compliance with the License.
 * A copy of the License is located at
 *
 * http://aws.amazon.com/asl/
 *
 * or in the "license" file accompanying this file. This file is distributed
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */
package com.amazonaws.services.kinesis.connectors.redshift;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.amazonaws.auth.AWSCredentials;
import com.amazonaws.auth.AWSCredentialsProvider;
import com.amazonaws.auth.AWSSessionCredentials;
import com.amazonaws.services.kinesis.connectors.KinesisConnectorConfiguration;
import com.amazonaws.services.kinesis.connectors.UnmodifiableBuffer;
import com.amazonaws.services.kinesis.connectors.interfaces.IEmitter;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.PutObjectRequest;

/**
 * This implementation of IEmitter collects filenames from an Amazon Kinesis stream that has been started by
 * a S3ManifestEmitter. The RedshiftManifestEmitter processes the list of Amazon S3 file names, generates a
 * manifest file and performs an Amazon Redshift copy. The Amazon Redshift copy is done using transactions to
 * prevent duplication of objects in Amazon Redshift. <br>
 * It follows the following procedure:
 * <ol>
 * <li>Begin Amazon Redshift transaction</li>
 * <li>If all files already exist in Amazon Redshift, roll back and checkpoint (this batch already completed
 * successfully so no need to duplicate)</li>
 * <li>Write a manifest file listing the not-yet-copied files to Amazon S3</li>
 * <li>Write file names to Amazon Redshift file table</li>
 * <li>Call Amazon Redshift copy</li>
 * <li>Commit Amazon Redshift Transaction</li>
 * </ol>
 * <p>
 * This class requires the configuration of an Amazon S3 bucket and endpoint, as well as the following Amazon Redshift
 * items:
 * <ul>
 * <li>Amazon Redshift URL</li>
 * <li>username and password</li>
 * <li>data table and key column (data table stores items from the manifest copy)</li>
 * <li>file table and key column (file table is used to store file names to prevent duplicate entries)</li>
 * <li>mandatory flag for Amazon Redshift copy</li>
 * <li>the delimiter used for string parsing when inserting entries into Amazon Redshift</li>
 * </ul>
 * <br>
 * NOTE: Amazon S3 bucket and Amazon Redshift table must be in the same region for Manifest Copy.
 */
public class RedshiftManifestEmitter implements IEmitter<String> {
    private static final Log LOG = LogFactory.getLog(RedshiftManifestEmitter.class);
    private static final String MANIFEST_PREFIX = "manifests/";

    // Amazon S3 configuration: where generated manifest files are written.
    private final String s3Bucket;
    private final String s3Endpoint;
    private final AmazonS3Client s3Client;
    // Amazon Redshift configuration.
    private final String dataTable;      // destination table of the manifest COPY
    private final String fileTable;      // bookkeeping table of already-copied file names
    private final String fileKeyColumn;  // key column of fileTable
    private final char dataDelimiter;    // field delimiter passed to COPY
    private final boolean copyMandatory; // "mandatory" flag written into each manifest entry
    private final Properties loginProps;
    private final String redshiftURL;
    // Supplies (possibly temporary session) credentials embedded in the COPY statement.
    private final AWSCredentialsProvider credentialsProvider;

    public RedshiftManifestEmitter(KinesisConnectorConfiguration configuration) {
        dataTable = configuration.REDSHIFT_DATA_TABLE;
        fileTable = configuration.REDSHIFT_FILE_TABLE;
        fileKeyColumn = configuration.REDSHIFT_FILE_KEY_COLUMN;
        dataDelimiter = configuration.REDSHIFT_DATA_DELIMITER;
        copyMandatory = configuration.REDSHIFT_COPY_MANDATORY;
        s3Bucket = configuration.S3_BUCKET;
        s3Endpoint = configuration.S3_ENDPOINT;
        s3Client = new AmazonS3Client(configuration.AWS_CREDENTIALS_PROVIDER);
        if (s3Endpoint != null) {
            s3Client.setEndpoint(s3Endpoint);
        }
        credentialsProvider = configuration.AWS_CREDENTIALS_PROVIDER;
        loginProps = new Properties();
        loginProps.setProperty("user", configuration.REDSHIFT_USERNAME);
        loginProps.setProperty("password", configuration.REDSHIFT_PASSWORD);
        redshiftURL = configuration.REDSHIFT_URL;
    }

    /**
     * Copies the Amazon S3 files named by the buffered records into Amazon Redshift via a manifest
     * COPY, inside a single transaction so a batch is never applied twice.
     *
     * @param buffer buffer of Amazon S3 file names emitted by an upstream S3ManifestEmitter
     * @return the empty list on success (everything was copied, or had already been copied);
     *         the full record list if this attempt failed and should be retried
     */
    @Override
    public List<String> emit(final UnmodifiableBuffer<String> buffer) throws IOException {
        List<String> records = buffer.getRecords();
        if (records.isEmpty()) {
            // Nothing to copy; also protects getManifestFile() from an empty list.
            return Collections.emptyList();
        }
        String manifestFileName = getManifestFile(records);
        Connection conn = null;
        try {
            conn = DriverManager.getConnection(redshiftURL, loginProps);
            // One transaction: the file-table insert and the COPY commit (or roll back) together.
            conn.setAutoCommit(false);
            List<String> deduplicatedRecords = checkForExistingFiles(conn, records);
            if (deduplicatedRecords.isEmpty()) {
                LOG.info("All the files in this set were already copied to Redshift.");
                // Entire batch was copied by a previous attempt; checkpoint without re-copying.
                // NOTE: do not mutate the record list here - it comes from an UnmodifiableBuffer.
                rollbackAndCloseConnection(conn);
                return Collections.emptyList();
            }
            if (deduplicatedRecords.size() != records.size()) {
                manifestFileName = getManifestFile(deduplicatedRecords);
            }
            // Write the manifest file to Amazon S3. Bug fix: the manifest must list the
            // deduplicated files (the ones actually being copied), not the original batch,
            // otherwise already-copied files would be loaded a second time.
            try {
                writeManifestToS3(manifestFileName, deduplicatedRecords);
            } catch (Exception e) {
                LOG.error("Error writing file " + manifestFileName + " to S3. Failing this emit attempt.", e);
                // Bug fix: release the connection before bailing out of this attempt.
                rollbackAndCloseConnection(conn);
                return buffer.getRecords();
            }
            LOG.info("Inserting " + deduplicatedRecords.size() + " rows into the files table.");
            insertRecords(conn, deduplicatedRecords);
            LOG.info("Initiating Amazon Redshift manifest copy of " + deduplicatedRecords.size() + " files.");
            redshiftCopy(conn, manifestFileName);
            conn.commit();
            LOG.info("Successful Amazon Redshift manifest copy of " + getNumberOfCopiedRecords(conn)
                    + " records from " + deduplicatedRecords.size() + " files using manifest s3://" + s3Bucket + "/"
                    + manifestFileName);
            closeConnection(conn);
            return Collections.emptyList();
        } catch (Exception e) {
            LOG.error("Error copying data from manifest file " + manifestFileName
                    + " into Amazon Redshift. Failing this emit attempt.", e);
            rollbackAndCloseConnection(conn);
            return buffer.getRecords();
        }
    }

    /** Rolls back the current transaction (if the connection is still open) and closes it. */
    private void rollbackAndCloseConnection(Connection conn) {
        try {
            if ((conn != null) && (!conn.isClosed())) {
                conn.rollback();
            }
        } catch (Exception e) {
            LOG.error("Unable to rollback Amazon Redshift transaction.", e);
        }
        closeConnection(conn);
    }

    /** Closes the connection, swallowing (but logging) any failure - used on cleanup paths only. */
    private void closeConnection(Connection conn) {
        try {
            if ((conn != null) && (!conn.isClosed())) {
                conn.close();
            }
        } catch (Exception e) {
            LOG.error("Unable to close Amazon Redshift connection.", e);
        }
    }

    @Override
    public void fail(List<String> records) {
        for (String record : records) {
            LOG.error("Record failed: " + record);
        }
    }

    /**
     * Generates a manifest file for the given records and writes it to Amazon S3 as UTF-8.
     *
     * @param fileName name of manifest file (Amazon S3 key)
     * @param records Amazon S3 file names to list in the manifest
     * @return the Amazon S3 key the manifest was written to (same as {@code fileName})
     * @throws IOException declared for interface symmetry; S3 failures surface as runtime exceptions
     */
    private String writeManifestToS3(String fileName, List<String> records) throws IOException {
        byte[] fileContents = generateManifestFile(records).getBytes(StandardCharsets.UTF_8);
        PutObjectRequest putObjectRequest = new PutObjectRequest(s3Bucket, fileName,
                new ByteArrayInputStream(fileContents), null);
        s3Client.putObject(putObjectRequest);
        return fileName;
    }

    /**
     * Inserts the records into the fileTable using a SQL String in the format:
     * INSERT INTO fileTable VALUES ('f1'),('f2'),...;
     *
     * @param records file names to record as copied (must be non-empty)
     * @throws IOException if the statement fails
     */
    private void insertRecords(Connection conn, Collection<String> records) throws IOException {
        String insertSQL = "INSERT INTO " + fileTable + " VALUES "
                + getCollectionString(records, "(", "),(", ")") + ";";
        executeStatement(conn, insertSQL);
    }

    /**
     * Selects the files that are already present in Amazon Redshift using a SQL query in the
     * format: SELECT fileKeyColumn FROM fileTable WHERE fileKeyColumn IN ('f1','f2',...);
     * and removes them from the batch.
     *
     * @param records candidate file names (must be non-empty)
     * @return sorted, deduplicated list of files not yet copied
     * @throws IOException if the query fails
     */
    private List<String> checkForExistingFiles(Connection conn, List<String> records) throws IOException {
        // TreeSet both deduplicates within the batch and gives a deterministic (sorted) order.
        SortedSet<String> recordSet = new TreeSet<>(records);
        String query = "SELECT " + fileKeyColumn + " FROM " + fileTable + " WHERE " + fileKeyColumn
                + " IN " + getCollectionString(recordSet, "(", ",", ")") + ";";
        try (Statement stmt = conn.createStatement();
                ResultSet resultSet = stmt.executeQuery(query)) {
            while (resultSet.next()) {
                String existingFile = resultSet.getString(1);
                LOG.info("File " + existingFile + " has already been copied. Leaving it out.");
                recordSet.remove(existingFile);
            }
            return new ArrayList<String>(recordSet);
        } catch (SQLException e) {
            throw new IOException(e);
        }
    }

    /**
     * Returns the row count of the most recent COPY on this session, via Amazon Redshift's
     * pg_last_copy_count(). Used only for the success log message.
     */
    private int getNumberOfCopiedRecords(Connection conn) throws IOException {
        String cmd = "select pg_last_copy_count();";
        try (Statement stmt = conn.createStatement();
                ResultSet resultSet = stmt.executeQuery(cmd)) {
            return resultSet.next() ? resultSet.getInt(1) : 0;
        } catch (SQLException e) {
            throw new IOException(e);
        }
    }

    /**
     * Executes an Amazon Redshift copy from Amazon S3 using a manifest file with a command in the format:
     * COPY dataTable FROM 's3://s3Bucket/manifestFile' CREDENTIALS
     * 'aws_access_key_id=accessKey;aws_secret_access_key=secretKey' DELIMITER 'dataDelimiter' MANIFEST;
     *
     * @param manifestFile name of the manifest file (Amazon S3 key)
     * @throws IOException if the COPY fails
     */
    protected void redshiftCopy(Connection conn, String manifestFile) throws IOException {
        AWSCredentials credentials = credentialsProvider.getCredentials();
        StringBuilder redshiftCopy = new StringBuilder();
        redshiftCopy.append("COPY " + dataTable + " ");
        redshiftCopy.append("FROM 's3://" + s3Bucket + "/" + manifestFile + "' ");
        // The credentials are embedded in the SQL text (required by Redshift COPY);
        // executeStatement() redacts this clause if the statement has to be logged.
        redshiftCopy.append("CREDENTIALS '");
        redshiftCopy.append("aws_access_key_id=" + credentials.getAWSAccessKeyId());
        redshiftCopy.append(";");
        redshiftCopy.append("aws_secret_access_key=" + credentials.getAWSSecretKey());
        if (credentials instanceof AWSSessionCredentials) {
            redshiftCopy.append(";");
            redshiftCopy.append("token=" + ((AWSSessionCredentials) credentials).getSessionToken());
        }
        redshiftCopy.append("' ");
        redshiftCopy.append("DELIMITER '" + dataDelimiter + "' ");
        redshiftCopy.append("MANIFEST");
        redshiftCopy.append(";");
        executeStatement(conn, redshiftCopy.toString());
    }

    /**
     * Helper function to execute a SQL statement with no results.
     *
     * @param statement SQL text to execute
     * @throws IOException wrapping any SQLException
     */
    private void executeStatement(Connection conn, String statement) throws IOException {
        try (Statement stmt = conn.createStatement()) {
            stmt.execute(statement);
        } catch (SQLException e) {
            LOG.error("Amazon S3 endpoint set to: " + s3Endpoint);
            // Security fix: never log the raw statement - a COPY contains AWS credentials.
            LOG.error("Error executing statement: " + sanitizeForLog(statement), e);
            throw new IOException(e);
        }
    }

    /** Redacts the CREDENTIALS literal of a COPY statement so secrets never reach the logs. */
    private static String sanitizeForLog(String statement) {
        return statement.replaceAll("CREDENTIALS '[^']*'", "CREDENTIALS '<redacted>'");
    }

    /**
     * Builds a String from the members of a Collection of String.
     *
     * @param members each member is emitted surrounded by single quotes, with any embedded
     *        single quote doubled (standard SQL escaping)
     * @param prepend beginning of String
     * @param delimiter between each member
     * @param append end of String
     * @return String in format: {prepend}'{member1}'{delimiter}'{member2}'...{append}
     */
    private String getCollectionString(Collection<String> members, String prepend, String delimiter,
            String append) {
        StringBuilder s = new StringBuilder(prepend);
        boolean first = true;
        for (String m : members) {
            if (!first) {
                s.append(delimiter);
            }
            first = false;
            s.append("'");
            s.append(m.replace("'", "''"));
            s.append("'");
        }
        s.append(append);
        return s.toString();
    }

    /**
     * Manifest file is named in the format manifests/{firstFileName}-{lastFileName}.
     *
     * @param records non-empty list of file names
     * @return manifest file name (Amazon S3 key)
     */
    private String getManifestFile(List<String> records) {
        return MANIFEST_PREFIX + records.get(0) + "-" + records.get(records.size() - 1);
    }

    /**
     * Format for Amazon Redshift Manifest File:
     *
     * <pre>
     * {
     *    "entries": [
     *       {"url":"s3://s3Bucket/file1","mandatory":true},
     *       {"url":"s3://s3Bucket/file2","mandatory":true},
     *       {"url":"s3://s3Bucket/file3","mandatory":true}
     *    ]
     * }
     * </pre>
     *
     * @param files Amazon S3 file names to list
     * @return String representation of the Amazon S3 manifest file
     */
    private String generateManifestFile(List<String> files) {
        StringBuilder s = new StringBuilder();
        s.append("{\n");
        s.append("\t\"entries\": [\n");
        boolean first = true;
        for (String file : files) {
            if (!first) {
                s.append(",\n");
            }
            first = false;
            s.append("\t\t{");
            s.append("\"url\":\"s3://");
            s.append(s3Bucket);
            s.append("/");
            s.append(file);
            s.append("\"");
            s.append(",");
            s.append("\"mandatory\":" + Boolean.toString(copyMandatory));
            s.append("}");
        }
        s.append("\n");
        s.append("\t]\n");
        s.append("}\n");
        return s.toString();
    }

    @Override
    public void shutdown() {
        s3Client.shutdown();
    }

}