Java tutorial: the Kinesis Aggregators DynamoQueryEngine
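This tutorial walks through DynamoQueryEngine, the class that Amazon Kinesis Aggregators uses to read aggregated data back out of Amazon DynamoDB. The engine offers three access paths: direct key lookups with queryByKey, a parallel segment scan that extracts the distinct key structure of a table with parallelQueryKeys, and a parallel date-range query that fans out across the table's scatter-prefix index with parallelQueryDate. The full source follows, lightly annotated.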
/**
 * Amazon Kinesis Aggregators
 *
 * Copyright 2014, Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Amazon Software License (the "License").
 * You may not use this file except in compliance with the License.
 * A copy of the License is located at
 *
 * http://aws.amazon.com/asl/
 *
 * or in the "license" file accompanying this file. This file is distributed
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */
package com.amazonaws.services.kinesis.aggregators.datastore;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.amazonaws.AmazonServiceException;
import com.amazonaws.services.dynamodbv2.AmazonDynamoDB;
import com.amazonaws.services.dynamodbv2.model.AttributeValue;
import com.amazonaws.services.dynamodbv2.model.BatchGetItemRequest;
import com.amazonaws.services.dynamodbv2.model.BatchGetItemResult;
import com.amazonaws.services.dynamodbv2.model.ComparisonOperator;
import com.amazonaws.services.dynamodbv2.model.Condition;
import com.amazonaws.services.dynamodbv2.model.GetItemRequest;
import com.amazonaws.services.dynamodbv2.model.KeysAndAttributes;
import com.amazonaws.services.dynamodbv2.model.ProvisionedThroughputExceededException;
import com.amazonaws.services.dynamodbv2.model.QueryRequest;
import com.amazonaws.services.dynamodbv2.model.QueryResult;
import com.amazonaws.services.dynamodbv2.model.ScanRequest;
import com.amazonaws.services.dynamodbv2.model.ScanResult;
import com.amazonaws.services.kinesis.aggregators.StreamAggregator;
import com.amazonaws.services.kinesis.aggregators.StreamAggregatorUtils;
import com.amazonaws.services.kinesis.aggregators.TableKeyStructure;

public class DynamoQueryEngine {
    private final Log LOG = LogFactory.getLog(DynamoQueryEngine.class);

    private AmazonDynamoDB dynamoClient;

    private String tableName, labelAttribute, dateAttribute;

    protected final int BACKOFF_MILLIS = 10;

    public DynamoQueryEngine(AmazonDynamoDB dynamoClient, String tableName, String labelAttribute,
            String dateAttribute) {
        this.dynamoClient = dynamoClient;
        this.tableName = tableName;
        this.labelAttribute = labelAttribute;
        this.dateAttribute = dateAttribute;
    }

    public enum QueryKeyScope {
        HashKey, HashAndRangeKey;
    }

    public List<TableKeyStructure> parallelQueryKeys(QueryKeyScope scope, int threads) throws Exception {
        List<ParallelKeyScanWorker> workers = new ArrayList<>();
        Collection<Future<?>> workerStatus = new ArrayList<>();
        List<TableKeyStructure> output = new ArrayList<>();
        int totalResultsProcessed = 0;

        // set up the executor thread pool
        ExecutorService executor = Executors.newFixedThreadPool(threads);

        // create workers for each segment that we need to do queries against
        for (int i = 0; i < threads; i++) {
            ParallelKeyScanWorker worker = new ParallelKeyScanWorker(this.tableName, i, threads, scope,
                    this.labelAttribute, this.dateAttribute);
            workers.add(worker);
            workerStatus.add(executor.submit(worker));
        }

        for (Future<?> f : workerStatus) {
            f.get();
        }
        executor.shutdown();

        for (ParallelKeyScanWorker w : workers) {
            // throw any exceptions the worker incurred
            w.throwExceptions();

            if (w.getResultCount() > 0) {
                output.addAll(w.getOutput());
            }

            totalResultsProcessed += w.getResultsProcessed();
        }

        LOG.info(String.format("Key Extraction Complete - Processed %s Key Items", totalResultsProcessed));

        return output;
    }
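    /**
     * Fetches aggregate items by key, using the cheapest access path that
     * fits the arguments: a hash-and-range Query when a date and a non-EQ
     * comparison operator are supplied, a hash-key-only Query when no date
     * is given, or a single GetItem when an exact date value is provided.
     */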
    public List<Map<String, AttributeValue>> queryByKey(String label, Date dateValue, ComparisonOperator operator)
            throws Exception {
        if (dateValue != null && !operator.equals(ComparisonOperator.EQ)) {
            String dateAsString = StreamAggregator.dateFormatter.format(dateValue);
            LOG.info(String.format("Issuing Hash/Range Query for %s - %s", label, dateAsString));

            // range query
            Map<String, Condition> keyConditions = new HashMap<>();

            // hash key
            Condition c = new Condition().withAttributeValueList(new AttributeValue().withS(label))
                    .withComparisonOperator(ComparisonOperator.EQ);
            keyConditions.put(this.labelAttribute, c);

            // range key
            c = new Condition().withAttributeValueList(new AttributeValue().withS(dateAsString))
                    .withComparisonOperator(operator);
            keyConditions.put(this.dateAttribute, c);

            QueryRequest req = new QueryRequest().withTableName(this.tableName).withKeyConditions(keyConditions);

            return DynamoUtils.queryUntilDone(dynamoClient, req, BACKOFF_MILLIS);
        } else {
            if (dateValue == null) {
                LOG.info(String.format("Issuing Hash Key Only Query for %s", label));

                // hash key only query
                Map<String, Condition> keyConditions = new HashMap<>();
                Condition c = new Condition().withAttributeValueList(new AttributeValue().withS(label))
                        .withComparisonOperator(ComparisonOperator.EQ);
                keyConditions.put(this.labelAttribute, c);

                QueryRequest req = new QueryRequest().withTableName(this.tableName)
                        .withKeyConditions(keyConditions);

                return DynamoUtils.queryUntilDone(dynamoClient, req, BACKOFF_MILLIS);
            } else {
                String dateAsString = StreamAggregator.dateFormatter.format(dateValue);
                LOG.info(String.format("Performing exact Hash/Range Lookup for %s - %s", label, dateAsString));

                // exact key lookup
                List<Map<String, AttributeValue>> output = new ArrayList<>();
                Map<String, AttributeValue> keyMap = new HashMap<>();
                keyMap.put(this.labelAttribute, new AttributeValue().withS(label));
                keyMap.put(this.dateAttribute, new AttributeValue().withS(dateAsString));

                GetItemRequest req = new GetItemRequest().withTableName(this.tableName).withKey(keyMap);

                // getItem() returns a null map when no item matches the key,
                // so guard against adding a null entry to the output
                Map<String, AttributeValue> item = this.dynamoClient.getItem(req).getItem();
                if (item != null) {
                    output.add(item);
                }

                return output;
            }
        }
    }

    private class ParallelKeyScanWorker implements Runnable {
        List<TableKeyStructure> output = new ArrayList<>();

        private String tableName, hashKey, rangeKey;

        private QueryKeyScope scope;

        private int workerInstance, threads;

        private int resultsProcessed = 0;

        private Exception exception;

        public ParallelKeyScanWorker(String tableName, int workerInstance, int threads, QueryKeyScope scope,
                String hashKey, String rangeKey) {
            this.tableName = tableName;
            this.workerInstance = workerInstance;
            this.hashKey = hashKey;
            this.rangeKey = rangeKey;
            this.threads = threads;
            this.scope = scope;
        }

        public int getResultCount() {
            if (this.output == null) {
                return 0;
            } else {
                return this.output.size();
            }
        }

        public int getResultsProcessed() {
            return this.resultsProcessed;
        }

        public void throwExceptions() throws Exception {
            if (this.exception != null) {
                throw this.exception;
            }
        }
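        /**
         * Scans one segment of the table, deduplicating the keys it sees.
         * For hash-key-only extraction the worker adaptively limits each
         * scan page and, on high cardinality hash values, skips forward
         * using a far-future sentinel range key rather than paging through
         * every row that shares the hash.
         */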
        @Override
        public void run() {
            ScanRequest scanRequest = new ScanRequest().withTableName(this.tableName)
                    .withAttributesToGet(this.hashKey).withSegment(this.workerInstance).withTotalSegments(threads);

            if (this.scope.equals(QueryKeyScope.HashAndRangeKey)) {
                // also project the range key. This is done once, up front, as
                // the varargs withAttributesToGet appends to the existing
                // attribute list and must not be repeated on every scan page
                scanRequest.withAttributesToGet(this.rangeKey);
            }

            Map<String, Set<String>> deduplicated = new HashMap<>();
            Set<String> rangeValues = null;
            Map<String, AttributeValue> lastKeyEvaluated = null;
            int scanAttempts = 0;
            int limit = -1;
            boolean returnedResults = false;
            String lastLabel = null;
            int uniqueLabels = 0;

            do {
                ScanResult result = null;

                // set query limits, to optimise for skip scan or for
                // hash/range query with no limit
                if (this.scope.equals(QueryKeyScope.HashKey)) {
                    if (uniqueLabels > 0 && uniqueLabels == resultsProcessed) {
                        // remove the query limit if every row being returned
                        // is unique
                        limit = -1;
                    } else {
                        // set a limit of twice the number of uniques, so we
                        // can get a larger result set as we go
                        if (uniqueLabels == 0) {
                            limit = 100;
                        } else {
                            limit = uniqueLabels * 2;
                        }

                        // reset the unique labels so it doesn't grow without
                        // limit
                        uniqueLabels = 0;
                    }
                }

                do {
                    try {
                        // set the limit if we have one, or clear any limit
                        // carried over from a previous page
                        if (limit != -1) {
                            scanRequest.withLimit(limit);
                        } else {
                            scanRequest.setLimit(null);
                        }

                        result = dynamoClient.scan(scanRequest.withExclusiveStartKey(lastKeyEvaluated));
                        returnedResults = !result.getItems().isEmpty();
                    } catch (ProvisionedThroughputExceededException e) {
                        LOG.warn(String.format("Provisioned Throughput Exceeded - Retry Attempt %s", scanAttempts));

                        // exponential backoff: sleep 2^attempts * base delay
                        try {
                            Thread.sleep((long) Math.pow(2, scanAttempts) * BACKOFF_MILLIS);
                        } catch (InterruptedException interrupted) {
                            this.exception = interrupted;
                            return;
                        }

                        scanAttempts++;
                    }
                } while (scanAttempts < 10 && result == null);

                if (result == null) {
                    this.exception = new Exception(
                            String.format("Unable to execute Scan after %s attempts", scanAttempts));
                    return;
                }

                // process the results, creating a deduplicated map/set of
                // hash/range keys
                String labelValue = null;
                if (returnedResults) {
                    for (Map<String, AttributeValue> map : result.getItems()) {
                        resultsProcessed++;
                        labelValue = map.get(this.hashKey).getS();

                        // only enter the label value into the hash once
                        if (scope.equals(QueryKeyScope.HashKey)) {
                            if (!labelValue.equals(lastLabel)) {
                                deduplicated.put(labelValue, null);
                                lastLabel = labelValue;
                                uniqueLabels++;
                            }
                        } else {
                            if (deduplicated.containsKey(labelValue)) {
                                rangeValues = deduplicated.get(labelValue);
                            } else {
                                rangeValues = new HashSet<String>();
                            }
                            rangeValues.add(map.get(this.rangeKey).getS());
                            deduplicated.put(labelValue, rangeValues);
                        }
                    }
                    // set the last evaluated key. If we have processed a bunch
                    // of data and are not at the end of the result set, then
                    // we'll force a skip forward on date, to eliminate
                    // continued processing of high cardinality hash values
                    if (this.scope.equals(QueryKeyScope.HashKey) && result.getLastEvaluatedKey() != null) {
                        // skip scan
                        lastKeyEvaluated = new HashMap<>();
                        lastKeyEvaluated.put(this.hashKey, new AttributeValue().withS(labelValue));
                        lastKeyEvaluated.put(this.rangeKey, new AttributeValue().withS("4000-01-01 00:00:00"));
                    } else {
                        lastKeyEvaluated = result.getLastEvaluatedKey();
                    }
                } else {
                    lastKeyEvaluated = null;
                }
            } while (lastKeyEvaluated != null);

            if (this.scope.equals(QueryKeyScope.HashKey)) {
                LOG.debug(
                        String.format("Worker %s extracted %s results", this.workerInstance, deduplicated.size()));
            } else {
                LOG.debug(String.format("Worker %s deduplicated %s results, creating distinct set of %s keys",
                        this.workerInstance, resultsProcessed, deduplicated.size()));
            }

            this.output = new ArrayList<>();
            if (deduplicated.size() > 0) {
                for (String s : deduplicated.keySet()) {
                    TableKeyStructure t = new TableKeyStructure(this.hashKey, s, this.rangeKey);

                    if (scope.equals(QueryKeyScope.HashAndRangeKey)) {
                        for (String rangeValue : deduplicated.get(s)) {
                            t.withDateValue(rangeValue);
                        }
                    }

                    output.add(t);
                }
            }
        }

        public List<TableKeyStructure> getOutput() {
            return this.output;
        }
    }
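    /**
     * Queries one contiguous block of scatter prefix values against a
     * date-dimension global secondary index, then pivots the matching items
     * into a map of label value to the set of date values seen for that
     * label.
     */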
    private class ParallelDateQueryWorker implements Runnable {
        private int start, range;

        private String tableName, indexName, labelAttribute, dateAttribute;

        private Map<String, Condition> conditions;

        private Exception exception;

        private Map<String, Set<String>> resultKeys = new HashMap<>();

        public void throwException() throws Exception {
            if (this.exception != null) {
                throw this.exception;
            }
        }

        public ParallelDateQueryWorker(String tableName, String indexName, int start, int range,
                Map<String, Condition> conditions, String labelAttribute, String dateAttribute) {
            this.tableName = tableName;
            this.indexName = indexName;
            this.start = start;
            this.range = range;
            // take a private copy of the conditions, so that concurrent
            // workers don't overwrite each other's scatter prefix condition
            this.conditions = new HashMap<>(conditions);
            this.labelAttribute = labelAttribute;
            this.dateAttribute = dateAttribute;
        }

        @Override
        public void run() {
            for (int i = this.start; i < this.start + this.range; i++) {
                List<Map<String, AttributeValue>> results = new ArrayList<>();

                // query one scatter prefix value at a time
                Condition c = new Condition().withComparisonOperator(ComparisonOperator.EQ)
                        .withAttributeValueList(new AttributeValue().withN("" + i));
                this.conditions.put(DynamoDataStore.SCATTER_PREFIX_ATTRIBUTE, c);

                QueryRequest req = new QueryRequest().withIndexName(this.indexName).withTableName(this.tableName)
                        .withKeyConditions(this.conditions);

                Map<String, AttributeValue> lastKeyEvaluated = null;
                do {
                    int queryAttempts = 0;
                    QueryResult result = null;

                    do {
                        try {
                            // page through the index using the exclusive
                            // start key from the previous result
                            result = dynamoClient.query(req.withExclusiveStartKey(lastKeyEvaluated));
                            results.addAll(result.getItems());
                        } catch (ProvisionedThroughputExceededException e) {
                            LOG.warn(String.format("Provisioned Throughput Exceeded - Retry Attempt %s",
                                    queryAttempts));

                            // exponential backoff: sleep 2^attempts * base delay
                            try {
                                Thread.sleep((long) Math.pow(2, queryAttempts) * BACKOFF_MILLIS);
                            } catch (InterruptedException interrupted) {
                                this.exception = interrupted;
                                return;
                            }

                            queryAttempts++;
                        }
                    } while (queryAttempts < 10 && result == null);

                    if (result == null) {
                        this.exception = new Exception(
                                String.format("Unable to execute Query after %s attempts", queryAttempts));
                        return;
                    }

                    lastKeyEvaluated = result.getLastEvaluatedKey();
                } while (lastKeyEvaluated != null);

                // pivot the results into a list of label values and set of
                // date values
                Set<String> values;
                for (Map<String, AttributeValue> map : results) {
                    String labelValue = null;
                    String dateValue = null;

                    // process each attribute
                    for (String s : map.keySet()) {
                        // grab the label and date values
                        if (s.equals(this.labelAttribute)) {
                            labelValue = map.get(s).getS();
                        } else if (s.equals(this.dateAttribute)) {
                            dateValue = map.get(s).getS();
                        }
                    }

                    if (labelValue != null && dateValue != null) {
                        // get the current set of date values for the label,
                        // or create a new one
                        if (!resultKeys.containsKey(labelValue)) {
                            values = new HashSet<>();
                        } else {
                            values = resultKeys.get(labelValue);
                        }

                        // add the current date value to the set of all date
                        // values for the label
                        values.add(dateValue);

                        // write back the map of label to date values
                        resultKeys.put(labelValue, values);
                    }
                }
            }
        }

        public Map<String, Set<String>> getResultKeys() {
            return this.resultKeys;
        }
    }

    private KeysAndAttributes convertResultKeys(Map<String, Set<String>> resultKeys) {
        KeysAndAttributes keys = new KeysAndAttributes();

        for (final String s : resultKeys.keySet()) {
            for (final String value : resultKeys.get(s)) {
                keys.withKeys(new HashMap<String, AttributeValue>() {
                    {
                        put(labelAttribute, new AttributeValue().withS(s));
                        put(dateAttribute, new AttributeValue().withS(value));
                    }
                });
            }
        }

        return keys;
    }

    private List<Map<String, AttributeValue>> batchGetDataByKeys(final String tableName,
            final KeysAndAttributes keys) {
        Map<String, KeysAndAttributes> requestMap = new HashMap<>();
        keys.setConsistentRead(true);
        requestMap.put(tableName, keys);
        BatchGetItemResult result = null;
        try {
            // note that any UnprocessedKeys returned by BatchGetItem are not
            // retried here
            result = dynamoClient.batchGetItem(new BatchGetItemRequest(requestMap));
        } catch (AmazonServiceException e) {
            LOG.error(e);
            throw e;
        }
        return result.getResponses().get(this.tableName);
    }
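    /**
     * Runs a parallel query against a date-dimension index, spreading blocks
     * of scatter prefix values across the requested number of worker
     * threads, then resolves the deduplicated keys to full items with
     * BatchGetItem in batches of 25.
     */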
    public List<Map<String, AttributeValue>> parallelQueryDate(String onAttribute, Map<String, Condition> conditions,
            int threads) throws Exception {
        // figure out the range of scatter prefix values we are going to
        // assign to each thread
        int range = (DynamoDataStore.SCATTER_WIDTH / threads) + 1;
        List<ParallelDateQueryWorker> workers = new ArrayList<>();
        Collection<Future<?>> workerStatus = new ArrayList<>();
        List<Map<String, AttributeValue>> output = new ArrayList<>();

        // set up the executor thread pool
        ExecutorService executor = Executors.newFixedThreadPool(threads);

        // determine which index we should work with
        String indexName;
        if (onAttribute.equals(StreamAggregator.LAST_WRITE_SEQ)) {
            indexName = StreamAggregatorUtils.getLastWriteSeqIndexName(this.tableName);
        } else {
            indexName = StreamAggregatorUtils.getDateDimensionIndexName(this.tableName, onAttribute);
        }

        StringBuilder conditionString = new StringBuilder();
        for (String s : conditions.keySet()) {
            conditionString.append(String.format("%s %s %s,", s, conditions.get(s).getComparisonOperator(),
                    conditions.get(s).getAttributeValueList().get(0)));
        }

        LOG.info(String.format("Querying %s with %s Threads on %s (Conditions: %s)", indexName, threads, onAttribute,
                conditionString.length() > 0
                        ? conditionString.substring(0, conditionString.length() - 1)
                        : "None"));

        // create a worker for each block of scatter prefix values that we
        // need to query against
        for (int i = 0; i < DynamoDataStore.SCATTER_WIDTH; i += range) {
            ParallelDateQueryWorker worker = new ParallelDateQueryWorker(this.tableName, indexName, i, range,
                    conditions, this.labelAttribute, this.dateAttribute);
            workers.add(worker);
            workerStatus.add(executor.submit(worker));
        }

        for (Future<?> f : workerStatus) {
            f.get();
        }
        executor.shutdown();

        // collect the results from the workers
        int outputCounter = 0;
        for (ParallelDateQueryWorker w : workers) {
            // throw any exceptions that the worker handled
            w.throwException();

            // generate a set of KeysAndAttributes from the deduplicated
            // output map of table keys
            Map<String, Set<String>> workerKeys = w.getResultKeys();
            KeysAndAttributes k = convertResultKeys(workerKeys);

            // break the KeysAndAttributes up into batches of 25 and query
            // for them
            KeysAndAttributes queryKeys = new KeysAndAttributes();
            if (k != null && k.getKeys() != null) {
                for (Map<String, AttributeValue> key : k.getKeys()) {
                    queryKeys.withKeys(key);
                    outputCounter++;

                    if (outputCounter % 25 == 0) {
                        output.addAll(batchGetDataByKeys(this.tableName, queryKeys));
                        queryKeys = new KeysAndAttributes();
                    }
                }

                // one final query for any remaining keys in a partial batch
                if (queryKeys.getKeys() != null && queryKeys.getKeys().size() > 0) {
                    output.addAll(batchGetDataByKeys(this.tableName, queryKeys));
                }
            }
        }

        return output;
    }
}
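To see how the pieces fit together, the sketch below constructs an engine and exercises the two public query paths. The table name and attribute names here are illustrative assumptions, not values defined by the library, so substitute the names your own aggregator configuration produces; the client construction follows the 2014-era SDK (newer 1.x SDKs would use AmazonDynamoDBClientBuilder).

import java.util.Date;
import java.util.List;
import java.util.Map;

import com.amazonaws.services.dynamodbv2.AmazonDynamoDB;
import com.amazonaws.services.dynamodbv2.AmazonDynamoDBClient;
import com.amazonaws.services.dynamodbv2.model.AttributeValue;
import com.amazonaws.services.dynamodbv2.model.ComparisonOperator;
import com.amazonaws.services.kinesis.aggregators.TableKeyStructure;
import com.amazonaws.services.kinesis.aggregators.datastore.DynamoQueryEngine;
import com.amazonaws.services.kinesis.aggregators.datastore.DynamoQueryEngine.QueryKeyScope;

public class QueryEngineExample {
    public static void main(String[] args) throws Exception {
        // credentials are resolved from the default provider chain
        AmazonDynamoDB dynamoClient = new AmazonDynamoDBClient();

        // table and attribute names are assumptions for this example
        DynamoQueryEngine engine = new DynamoQueryEngine(dynamoClient, "myAggregatorTable",
                "eventName", "dateValue");

        // extract the distinct hash keys using 4 parallel scan segments
        List<TableKeyStructure> keys = engine.parallelQueryKeys(QueryKeyScope.HashKey, 4);
        System.out.println(String.format("Found %s distinct keys", keys.size()));

        // fetch all aggregates for one label on or after a given date
        Date since = new Date(System.currentTimeMillis() - (24 * 60 * 60 * 1000L));
        List<Map<String, AttributeValue>> items = engine.queryByKey("my-event", since,
                ComparisonOperator.GE);
        System.out.println(String.format("Query returned %s items", items.size()));
    }
}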