Java tutorial: RedshiftManifestEmitter
/*
 * Copyright 2013-2014 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Amazon Software License (the "License").
 * You may not use this file except in compliance with the License.
 * A copy of the License is located at
 *
 *     http://aws.amazon.com/asl/
 *
 * or in the "license" file accompanying this file. This file is distributed
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */
package com.amazonaws.services.kinesis.connectors.redshift;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.amazonaws.auth.AWSCredentials;
import com.amazonaws.auth.AWSCredentialsProvider;
import com.amazonaws.auth.AWSSessionCredentials;
import com.amazonaws.services.kinesis.connectors.KinesisConnectorConfiguration;
import com.amazonaws.services.kinesis.connectors.UnmodifiableBuffer;
import com.amazonaws.services.kinesis.connectors.interfaces.IEmitter;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.PutObjectRequest;

/**
 * This implementation of IEmitter collects file names from an Amazon Kinesis stream that is populated by
 * an S3ManifestEmitter. The RedshiftManifestEmitter processes the list of Amazon S3 file names, generates a
 * manifest file, and performs an Amazon Redshift copy. The copy is run inside a transaction to
 * prevent duplication of objects in Amazon Redshift.
 * <br>
 * It performs the following procedure:
 * <ol>
 * <li>Write the manifest file to Amazon S3</li>
 * <li>Begin an Amazon Redshift transaction</li>
 * <li>If every file already exists in Amazon Redshift, return and checkpoint (the transaction already
 * completed successfully, so there is no need to duplicate it); otherwise drop the duplicates and continue
 * with the remaining files</li>
 * <li>Write the file names to the Amazon Redshift file table</li>
 * <li>Issue the Amazon Redshift copy</li>
 * <li>Commit the Amazon Redshift transaction</li>
 * </ol>
 * <p>
 * This class requires the configuration of an Amazon S3 bucket and endpoint, as well as the following Amazon
 * Redshift items:
 * <ul>
 * <li>Amazon Redshift URL</li>
 * <li>username and password</li>
 * <li>data table and key column (the data table stores items from the manifest copy)</li>
 * <li>file table and key column (the file table stores file names to prevent duplicate entries)</li>
 * <li>mandatory flag for the Amazon Redshift copy</li>
 * <li>the delimiter used for string parsing when inserting entries into Amazon Redshift</li>
 * </ul>
 * <br>
 * NOTE: The Amazon S3 bucket and the Amazon Redshift cluster must be in the same region for the manifest copy.
 */
public class RedshiftManifestEmitter implements IEmitter<String> {
    private static final Log LOG = LogFactory.getLog(RedshiftManifestEmitter.class);
    private static final String MANIFEST_PREFIX = "manifests/";

    private final String s3Bucket;
    private final String dataTable;
    private final String fileTable;
    private final String fileKeyColumn;
    private final char dataDelimiter;
    private final AWSCredentialsProvider credentialsProvider;
    private final String s3Endpoint;
    private final AmazonS3Client s3Client;
    private final boolean copyMandatory;
    private final Properties loginProps;
    private final String redshiftURL;

    public RedshiftManifestEmitter(KinesisConnectorConfiguration configuration) {
        dataTable = configuration.REDSHIFT_DATA_TABLE;
        fileTable = configuration.REDSHIFT_FILE_TABLE;
        fileKeyColumn = configuration.REDSHIFT_FILE_KEY_COLUMN;
        dataDelimiter = configuration.REDSHIFT_DATA_DELIMITER;
        copyMandatory = configuration.REDSHIFT_COPY_MANDATORY;
        s3Bucket = configuration.S3_BUCKET;
        s3Endpoint = configuration.S3_ENDPOINT;
        s3Client = new AmazonS3Client(configuration.AWS_CREDENTIALS_PROVIDER);
        if (s3Endpoint != null) {
            s3Client.setEndpoint(s3Endpoint);
        }
        credentialsProvider = configuration.AWS_CREDENTIALS_PROVIDER;
        loginProps = new Properties();
        loginProps.setProperty("user", configuration.REDSHIFT_USERNAME);
        loginProps.setProperty("password", configuration.REDSHIFT_PASSWORD);
        redshiftURL = configuration.REDSHIFT_URL;
    }

    @Override
    public List<String> emit(final UnmodifiableBuffer<String> buffer) throws IOException {
        List<String> records = buffer.getRecords();
        Connection conn = null;
        String manifestFileName = getManifestFile(records);
        // Copy to Amazon Redshift using the manifest file
        try {
            conn = DriverManager.getConnection(redshiftURL, loginProps);
            conn.setAutoCommit(false);
            List<String> deduplicatedRecords = checkForExistingFiles(conn, records);
            if (deduplicatedRecords.isEmpty()) {
                // All of these files were already written
                LOG.info("All the files in this set were already copied to Redshift.");
                rollbackAndCloseConnection(conn);
                return Collections.emptyList();
            }
            if (deduplicatedRecords.size() != records.size()) {
                manifestFileName = getManifestFile(deduplicatedRecords);
            }
            // Write the manifest file (deduplicated entries only) to Amazon S3
            try {
                writeManifestToS3(manifestFileName, deduplicatedRecords);
            } catch (Exception e) {
                LOG.error("Error writing file " + manifestFileName + " to S3. Failing this emit attempt.", e);
                // Release the open transaction and connection before failing
                rollbackAndCloseConnection(conn);
                return buffer.getRecords();
            }
            LOG.info("Inserting " + deduplicatedRecords.size() + " rows into the files table.");
            insertRecords(conn, deduplicatedRecords);
            LOG.info("Initiating Amazon Redshift manifest copy of " + deduplicatedRecords.size() + " files.");
            redshiftCopy(conn, manifestFileName);
            conn.commit();
            LOG.info("Successful Amazon Redshift manifest copy of " + getNumberOfCopiedRecords(conn)
                    + " records from " + deduplicatedRecords.size() + " files using manifest s3://"
                    + s3Bucket + "/" + manifestFileName);
            closeConnection(conn);
            return Collections.emptyList();
        } catch (SQLException | IOException e) {
            LOG.error("Error copying data from manifest file " + manifestFileName
                    + " into Amazon Redshift. Failing this emit attempt.", e);
            rollbackAndCloseConnection(conn);
            return buffer.getRecords();
        } catch (Exception e) {
            LOG.error("Error copying data from manifest file " + manifestFileName
                    + " into Redshift. Failing this emit attempt.", e);
            rollbackAndCloseConnection(conn);
            return buffer.getRecords();
        }
    }

    private void rollbackAndCloseConnection(Connection conn) {
        try {
            if ((conn != null) && (!conn.isClosed())) {
                conn.rollback();
            }
        } catch (Exception e) {
            LOG.error("Unable to rollback Amazon Redshift transaction.", e);
        }
        closeConnection(conn);
    }

    private void closeConnection(Connection conn) {
        try {
            if ((conn != null) && (!conn.isClosed())) {
                conn.close();
            }
        } catch (Exception e) {
            LOG.error("Unable to close Amazon Redshift connection.", e);
        }
    }

    @Override
    public void fail(List<String> records) {
        for (String record : records) {
            LOG.error("Record failed: " + record);
        }
    }

    /**
     * Generates the manifest file and writes it to Amazon S3.
     *
     * @param fileName Name of the manifest file (Amazon S3 key)
     * @param records Used to generate the manifest file
     * @throws IOException
     */
    private String writeManifestToS3(String fileName, List<String> records) throws IOException {
        String fileContents = generateManifestFile(records);
        // Upload the generated manifest file
        PutObjectRequest putObjectRequest = new PutObjectRequest(s3Bucket, fileName,
                new ByteArrayInputStream(fileContents.getBytes()), null);
        s3Client.putObject(putObjectRequest);
        return fileName;
    }

    /**
     * Inserts the records into the fileTable using a SQL string in the format:
     * INSERT INTO fileTable VALUES ('f1'),('f2'),...;
     *
     * @param records
     * @throws IOException
     */
    private void insertRecords(Connection conn, Collection<String> records) throws IOException {
        String toInsert = getCollectionString(records, "(", "),(", ")");
        StringBuilder insertSQL = new StringBuilder();
        insertSQL.append("INSERT INTO ");
        insertSQL.append(fileTable);
        insertSQL.append(" VALUES ");
        insertSQL.append(toInsert);
        insertSQL.append(";");
        executeStatement(conn, insertSQL.toString());
    }

    /**
     * Selects the file names that are already present in Amazon Redshift using a SQL query in the
     * format: SELECT fileKeyColumn FROM fileTable WHERE fileKeyColumn IN ('f1','f2',...);
     * and removes them from the candidate set.
     *
     * @param records
     * @return Deduplicated list of files
     * @throws IOException
     */
    private List<String> checkForExistingFiles(Connection conn, List<String> records) throws IOException {
        SortedSet<String> recordSet = new TreeSet<>(records);
        String files = getCollectionString(recordSet, "(", ",", ")");
        StringBuilder selectExisting = new StringBuilder();
        selectExisting.append("SELECT " + fileKeyColumn + " FROM ");
        selectExisting.append(fileTable);
        selectExisting.append(" WHERE ");
        selectExisting.append(fileKeyColumn);
        selectExisting.append(" IN ");
        selectExisting.append(files);
        selectExisting.append(";");
        Statement stmt = null;
        ResultSet resultSet = null;
        try {
            stmt = conn.createStatement();
            final String query = selectExisting.toString();
            resultSet = stmt.executeQuery(query);
            while (resultSet.next()) {
                String existingFile = resultSet.getString(1);
                LOG.info("File " + existingFile + " has already been copied. Leaving it out.");
                recordSet.remove(existingFile);
            }
            resultSet.close();
            stmt.close();
            return new ArrayList<String>(recordSet);
        } catch (SQLException e) {
            try {
                resultSet.close();
            } catch (Exception e1) {
            }
            try {
                stmt.close();
            } catch (Exception e1) {
            }
            throw new IOException(e);
        }
    }

    private int getNumberOfCopiedRecords(Connection conn) throws IOException {
        String cmd = "select pg_last_copy_count();";
        Statement stmt = null;
        ResultSet resultSet = null;
        try {
            stmt = conn.createStatement();
            resultSet = stmt.executeQuery(cmd);
            resultSet.next();
            int numCopiedRecords = resultSet.getInt(1);
            resultSet.close();
            stmt.close();
            return numCopiedRecords;
        } catch (SQLException e) {
            try {
                resultSet.close();
            } catch (Exception e1) {
            }
            try {
                stmt.close();
            } catch (Exception e1) {
            }
            throw new IOException(e);
        }
    }

    /**
     * Executes an Amazon Redshift copy from Amazon S3 using a manifest file, with a command in the format:
     * COPY dataTable FROM 's3://s3Bucket/manifestFile' CREDENTIALS
     * 'aws_access_key_id=accessKey;aws_secret_access_key=secretKey' DELIMITER 'dataDelimiter' MANIFEST;
     *
     * @param manifestFile Name of the manifest file
     * @throws IOException
     */
    protected void redshiftCopy(Connection conn, String manifestFile) throws IOException {
        AWSCredentials credentials = credentialsProvider.getCredentials();
        StringBuilder redshiftCopy = new StringBuilder();
        redshiftCopy.append("COPY " + dataTable + " ");
        redshiftCopy.append("FROM 's3://" + s3Bucket + "/" + manifestFile + "' ");
        redshiftCopy.append("CREDENTIALS '");
        redshiftCopy.append("aws_access_key_id=" + credentials.getAWSAccessKeyId());
        redshiftCopy.append(";");
        redshiftCopy.append("aws_secret_access_key=" + credentials.getAWSSecretKey());
        if (credentials instanceof AWSSessionCredentials) {
            // Temporary credentials also require the session token
            redshiftCopy.append(";");
            redshiftCopy.append("token=" + ((AWSSessionCredentials) credentials).getSessionToken());
        }
        redshiftCopy.append("' ");
        redshiftCopy.append("DELIMITER '" + dataDelimiter + "' ");
        redshiftCopy.append("MANIFEST");
        redshiftCopy.append(";");
        executeStatement(conn, redshiftCopy.toString());
    }

    /**
     * Helper function to execute a SQL statement that returns no results.
     *
     * @param statement
     * @throws IOException
     */
    private void executeStatement(Connection conn, String statement) throws IOException {
        try {
            Statement stmt = conn.createStatement();
            stmt.execute(statement);
            stmt.close();
        } catch (SQLException e) {
            LOG.error("Amazon S3 endpoint set to: " + s3Endpoint);
            LOG.error("Error executing statement: " + statement, e);
            throw new IOException(e);
        }
    }

    /**
     * Builds a String from the members of a Collection of Strings.
     *
     * @param members
     *            Collection of Strings; each member will be surrounded by single quotes
     * @param prepend
     *            beginning of the String
     * @param delimiter
     *            between each member
     * @param append
     *            end of the String
     * @return String in the format: {prepend}'{member1}'{delimiter}'{member2}'{delimiter}...'{lastMember}'{append}
     */
    private String getCollectionString(Collection<String> members, String prepend, String delimiter, String append) {
        StringBuilder s = new StringBuilder();
        s.append(prepend);
        for (String m : members) {
            s.append("'");
            s.append(m);
            s.append("'");
            s.append(delimiter);
        }
        // Remove the trailing delimiter
        s.replace(s.length() - delimiter.length(), s.length(), "");
        s.append(append);
        return s.toString();
    }

    /**
     * The manifest file is named in the format: manifests/{firstFileName}-{lastFileName}
     *
     * @param records
     * @return Manifest file name
     */
    private String getManifestFile(List<String> records) {
        return MANIFEST_PREFIX + records.get(0) + "-" + records.get(records.size() - 1);
    }

    /**
     * Format of the Amazon Redshift manifest file:
     *
     * <pre>
     * {
     *     "entries": [
     *         {"url":"s3://s3Bucket/file1","mandatory":true},
     *         {"url":"s3://s3Bucket/file2","mandatory":true},
     *         {"url":"s3://s3Bucket/file3","mandatory":true}
     *     ]
     * }
     * </pre>
     *
     * @param files
     * @return String representation of the Amazon S3 manifest file
     */
    private String generateManifestFile(List<String> files) {
        StringBuilder s = new StringBuilder();
        s.append("{\n");
        s.append("\t\"entries\": [\n");
        for (String file : files) {
            s.append("\t\t{");
            s.append("\"url\":\"s3://");
            s.append(s3Bucket);
            s.append("/");
            s.append(file);
            s.append("\"");
            s.append(",");
            s.append("\"mandatory\":" + Boolean.toString(copyMandatory));
            s.append("},\n");
        }
        // Remove the comma after the last entry
        s.replace(s.length() - 2, s.length() - 1, "");
        s.append("\t]\n");
        s.append("}\n");
        return s.toString();
    }

    @Override
    public void shutdown() {
        s3Client.shutdown();
    }
}
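
A short wiring example may help show how this emitter is constructed in practice. The sketch below is hypothetical and not part of the original source: the bucket, endpoint, cluster URL, table, column, and credential values are all placeholders, and the property keys mirror the names used in the amazon-kinesis-connectors sample configurations, so verify them against your version of KinesisConnectorConfiguration.

// Hypothetical wiring sketch for RedshiftManifestEmitter. All values below are
// placeholders; the property keys follow the amazon-kinesis-connectors sample
// configurations and should be checked against your library version.
import java.util.Properties;

import com.amazonaws.auth.DefaultAWSCredentialsProviderChain;
import com.amazonaws.services.kinesis.connectors.KinesisConnectorConfiguration;
import com.amazonaws.services.kinesis.connectors.interfaces.IEmitter;
import com.amazonaws.services.kinesis.connectors.redshift.RedshiftManifestEmitter;

public class RedshiftManifestEmitterExample {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("s3Bucket", "my-manifest-bucket");                 // placeholder bucket
        props.setProperty("s3Endpoint", "https://s3.amazonaws.com");
        props.setProperty("redshiftURL",
                "jdbc:postgresql://my-cluster.example.redshift.amazonaws.com:5439/mydb"); // placeholder
        props.setProperty("redshiftUsername", "masteruser");                 // placeholder
        props.setProperty("redshiftPassword", "placeholder-password");
        props.setProperty("redshiftDataTable", "kinesis_data");              // receives the copied rows
        props.setProperty("redshiftFileTable", "kinesis_files");             // single varchar column, see note below
        props.setProperty("redshiftFileKeyColumn", "file");
        props.setProperty("redshiftDataDelimiter", "|");
        props.setProperty("redshiftCopyMandatory", "true");

        KinesisConnectorConfiguration config =
                new KinesisConnectorConfiguration(props, new DefaultAWSCredentialsProviderChain());

        // In a real pipeline the connector framework constructs the emitter and
        // calls emit(...) with buffers of Amazon S3 file names.
        IEmitter<String> emitter = new RedshiftManifestEmitter(config);
        emitter.shutdown();
    }
}

Because insertRecords issues INSERT INTO fileTable VALUES ('f1'),('f2'),...;, the file table is expected to consist of a single character-type column (the configured fileKeyColumn) holding the Amazon S3 file names. That insert, committed in the same transaction as the COPY, is what makes a replayed emit of the same buffer a no-op rather than a duplicate load.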