Java tutorial: DynamoDBStorage, a Pig StoreFunc that writes to Amazon DynamoDB
/*
 * Copyright 2014 Mortar Data Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.samsung.px.pig.storage;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.concurrent.LinkedBlockingQueue;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.StoreFunc;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;
import org.apache.pig.tools.pigstats.PigStatusReporter;

import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.dynamodb.AmazonDynamoDBClient;
import com.amazonaws.services.dynamodb.model.AttributeValue;
import com.amazonaws.services.dynamodb.model.BatchWriteItemRequest;
import com.amazonaws.services.dynamodb.model.BatchWriteItemResult;
import com.amazonaws.services.dynamodb.model.BatchWriteResponse;
import com.amazonaws.services.dynamodb.model.DescribeTableRequest;
import com.amazonaws.services.dynamodb.model.DescribeTableResult;
import com.amazonaws.services.dynamodb.model.KeySchema;
import com.amazonaws.services.dynamodb.model.PutRequest;
import com.amazonaws.services.dynamodb.model.WriteRequest;
import com.google.common.base.Stopwatch;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

public class DynamoDBStorage extends StoreFunc {

    private static final Log log = LogFactory.getLog(DynamoDBStorage.class);

    // counters
    public static final String DYNAMO_COUNTER_GROUP = "DynamoDBStorage";
    public static final String DYNAMO_COUNTER_NULL_FIELDS_DISCARDED = "Null Fields Discarded";
    public static final String DYNAMO_COUNTER_EMPTY_STRING_FIELDS_DISCARDED = "Empty String Fields Discarded";
    public static final String DYNAMO_COUNTER_BYTES_WRITTEN = "Bytes Written";
    public static final String DYNAMO_COUNTER_RECORDS_WRITTEN = "Records Written";
    public static final String DYNAMO_COUNTER_CONSUMED_CAPACITY = "Consumed Capacity";
    public static final String DYNAMO_COUNTER_RETRIES = "Retries";

    // configuration properties
    public static final String MAX_RETRY_WAIT_MILLISECONDS_PROPERTY = "dynamodb.retry.wait.max";
    public static final long MAX_RETRY_WAIT_MILLISECONDS_DEFAULT = 1000L * 60L * 2L;

    // maximum number of times to retry a write before giving up
    public static final String MAX_NUM_RETRIES_PER_BATCH_WRITE_PROPERTY = "dynamodb.retry.max_per_batch_write";
    public static final int MAX_NUM_RETRIES_PER_BATCH_WRITE = 100;

    public static final String THROUGHPUT_WRITE_PERCENT_PROPERTY = "dynamodb.throughput.write.percent";
    public static final float THROUGHPUT_WRITE_PERCENT_DEFAULT = 0.5f;

    // milliseconds to wait while throughput is exhausted before checking again
    public static final long THROUGHPUT_WAIT_MS = 100;

    // minimum number of elements to require before sending a batch to dynamo
    public static final String MINIMUM_BATCH_SIZE_PROPERTY = "dynamodb.batch_size.min";
    public static final int MINIMUM_BATCH_SIZE_DEFAULT = 15;

    private static final String SCHEMA_PROPERTY = "pig.dynamodbstorage.schema";
    private static final int DYNAMO_MAX_ITEMS_IN_BATCH_WRITE_REQUEST = 25;
    private static final long DYNAMO_MAX_ITEM_SIZE_IN_BYTES = 65536;
    private static final long DYNAMO_MAX_CAPACITY_IN_WRITE_REQUEST = 1024;

    private String tableName = null;
    private String awsAccessKeyId = null;
    private String awsSecretKey = null;
    private String udfContextSignature = null;
    protected ResourceSchema schema = null;
    private AmazonDynamoDBClient dynamo = null;

    private long maxRetryWaitMilliseconds;
    private int maxNumRetriesPerBatchWrite;
    private double throughputWritePercent;
    private int minBatchSize;
    private HadoopJobInfo hadoopJobInfo = null;

    // token bucket
    private double maxWriteCapacity;
    private double currentWriteCapacity;
    private Stopwatch stopwatch = null;

    private DynamoWriteRequestBlockingQueue queue = null;

    DynamoDBStorage(String tableName, String awsAccessKeyId, String awsSecretKey,
            AmazonDynamoDBClient dynamo, HadoopJobInfo hadoopJobInfo) {
        this.tableName = tableName;
        this.awsAccessKeyId = awsAccessKeyId;
        this.awsSecretKey = awsSecretKey;
        this.dynamo = dynamo;
        this.hadoopJobInfo = hadoopJobInfo;
    }

    public DynamoDBStorage(String tableName, String awsAccessKeyId, String awsSecretKey) {
        this(tableName, awsAccessKeyId, awsSecretKey, null, null);
    }

    /** FRONTEND **/

    @SuppressWarnings("rawtypes")
    @Override
    public OutputFormat getOutputFormat() throws IOException {
        return new DynamoDBOutputFormat();
    }

    @Override
    public void setStoreFuncUDFContextSignature(String signature) {
        // Store the signature so we can use it later
        this.udfContextSignature = signature;
    }

    @Override
    public void checkSchema(ResourceSchema s) throws IOException {
        checkPigSchemaForDynamo(s);
        UDFContext udfc = UDFContext.getUDFContext();
        Properties p = udfc.getUDFProperties(this.getClass(), new String[] { this.udfContextSignature });
        p.setProperty(SCHEMA_PROPERTY, s.toString());
    }

    /** FRONTEND and BACKEND **/

    @Override
    public void setStoreLocation(String location, Job job) throws IOException {
        this.hadoopJobInfo = loadHadoopJobInfo(job);
        Configuration conf = this.hadoopJobInfo.getJobConfiguration();

        this.maxRetryWaitMilliseconds =
                conf.getLong(MAX_RETRY_WAIT_MILLISECONDS_PROPERTY, MAX_RETRY_WAIT_MILLISECONDS_DEFAULT);
        this.maxNumRetriesPerBatchWrite =
                conf.getInt(MAX_NUM_RETRIES_PER_BATCH_WRITE_PROPERTY, MAX_NUM_RETRIES_PER_BATCH_WRITE);

        this.throughputWritePercent = new Float(
                conf.getFloat(THROUGHPUT_WRITE_PERCENT_PROPERTY, THROUGHPUT_WRITE_PERCENT_DEFAULT)).doubleValue();
        if (this.throughputWritePercent < 0.1 || this.throughputWritePercent > 1.5) {
            throw new IOException(THROUGHPUT_WRITE_PERCENT_PROPERTY
                    + " must be between 0.1 and 1.5. Got: " + this.throughputWritePercent);
        }

        this.minBatchSize = conf.getInt(MINIMUM_BATCH_SIZE_PROPERTY, MINIMUM_BATCH_SIZE_DEFAULT);
        if (this.minBatchSize < 1 || this.minBatchSize > DYNAMO_MAX_ITEMS_IN_BATCH_WRITE_REQUEST) {
            throw new IOException(MINIMUM_BATCH_SIZE_PROPERTY + " must be between 1 and "
                    + DYNAMO_MAX_ITEMS_IN_BATCH_WRITE_REQUEST + ". Got: " + this.minBatchSize);
        }
    }

    /** BACKEND **/

    @SuppressWarnings("rawtypes")
    @Override
    public void prepareToWrite(RecordWriter writer) throws IOException {
        // Get the schema string from the UDFContext object.
        UDFContext udfc = UDFContext.getUDFContext();
        Properties p = udfc.getUDFProperties(this.getClass(), new String[] { this.udfContextSignature });
        String strSchema = p.getProperty(SCHEMA_PROPERTY);
        if (strSchema == null) {
            throw new IOException("Could not find schema in UDF context at property " + SCHEMA_PROPERTY);
        }

        // Parse the schema from the string stored in the properties object.
        this.schema = new ResourceSchema(Utils.getSchemaFromString(strSchema));

        // connect to dynamo
        this.dynamo = loadDynamoDB();

        // fetch capacity we are allowed to use
        this.maxWriteCapacity = getMaxWriteCapacity();
        this.currentWriteCapacity = this.maxWriteCapacity;

        this.queue = new DynamoWriteRequestBlockingQueue();

        // create and start the stopwatch
        this.stopwatch = new Stopwatch().start();
    }

    public void putNext(Tuple tuple) throws IOException {
        WriteRequestWithCapacity request = getWriteRequestWithCapacity(tuple);

        // loop until we've successfully enqueued our request
        while (request != null) {
            resetCurrentWriteCapacity();
            if (shouldDoBatchWrite(this.minBatchSize)) {
                submitBatchWriteItemRequest();
            }
            if (this.queue.offer(request)) {
                log.debug("Successfully added item to queue. queue size: " + this.queue.size());
                break;
            } else {
                // pause for a bit to let the queue unfill
                try {
                    Thread.sleep(THROUGHPUT_WAIT_MS);
                } catch (InterruptedException e) {
                    throw new RuntimeException(e);
                }
            }
        }
    }

    /** HELPERS
     * @throws IOException **/

    private void checkPigSchemaForDynamo(ResourceSchema schema) throws IOException {
        // extract field names
        Set<String> fieldNames = Sets.newHashSetWithExpectedSize(schema.getFields().length);
        for (ResourceFieldSchema field : schema.getFields()) {
            String fieldName = field.getName();
            if (fieldNames.contains(fieldName)) {
                throw new IOException(
                        "Schema cannot contain duplicated field name. Found duplicated: " + fieldName);
            }
            if (field.getType() == DataType.MAP || // field.getType() == DataType.TUPLE ||
                    field.getType() == DataType.BAG) {
                throw new IOException(
                        "DynamoDBStorage cannot store map or bag types. Found one in field name: " + fieldName);
            }
            fieldNames.add(fieldName);
        }

        // ensure that Dynamo table primary keys are found in field names
        DescribeTableResult describe = describeDynamoTable();
        KeySchema dynamoKeySchema = describe.getTable().getKeySchema();

        if (dynamoKeySchema.getHashKeyElement() != null) {
            String expectedFieldName = dynamoKeySchema.getHashKeyElement().getAttributeName();
            if (!fieldNames.contains(expectedFieldName)) {
                throw new IOException("Dynamo table " + this.tableName + " hash primary key ["
                        + expectedFieldName + "] not found in " + " pig schema fields: " + fieldNames);
            }
        }

        if (dynamoKeySchema.getRangeKeyElement() != null) {
            String expectedFieldName = dynamoKeySchema.getRangeKeyElement().getAttributeName();
            if (!fieldNames.contains(expectedFieldName)) {
                throw new IOException("Dynamo table " + this.tableName + " range secondary key ["
                        + expectedFieldName + "] not found in " + " pig schema fields: " + fieldNames);
            }
        }
    }

    private HadoopJobInfo loadHadoopJobInfo(Job job) {
        if (this.hadoopJobInfo == null) {
            this.hadoopJobInfo = new HadoopJobInfo(job);
        }
        return this.hadoopJobInfo;
    }

    private AmazonDynamoDBClient loadDynamoDB() {
        if (this.dynamo == null) {
            this.dynamo = new AmazonDynamoDBClient(new BasicAWSCredentials(this.awsAccessKeyId, this.awsSecretKey));
        }
        return this.dynamo;
    }

    long getMaxWriteCapacity() throws IOException {
        // grab the full capacity of the dynamo table
        long fullTableWriteCapacity = getDynamoTableWriteCapacity();

        // grab the number of tasks for our portion of the Job
        int numTasksForStore = this.hadoopJobInfo.getNumTasksForStore();

        // grab the max number of tasks that could run at once
        int maxSlotsForStore = this.hadoopJobInfo.getNumSlotsForStore();

        // the maximum number of concurrent tasks will be
        // the minimum of the tasks in the job and the slots available to run them
        int maxConcurrentTasks = Math.min(numTasksForStore, maxSlotsForStore);

        // calculate full table write capacity per running task
        double fullCapacityPerTask = new Double(fullTableWriteCapacity) / new Double(maxConcurrentTasks);

        // modulate this by the amount of write capacity requested by the user
        // and cast down to a long to truncate (be conservative)
        Double capacityPerTaskDbl = fullCapacityPerTask * this.throughputWritePercent;
        long capacityPerTask = Math.max(capacityPerTaskDbl.longValue(), 1);

        log.info("Allocating [" + capacityPerTask + "] write capacity units to this "
                + this.hadoopJobInfo.getMapOrReduce() + " task; full table capacity: ["
                + fullTableWriteCapacity + "]," + " numTasksForStore: [" + numTasksForStore + "],"
                + " maxSlotsForStore: [" + maxSlotsForStore + "]"
                + " requested write throughput pct: [" + this.throughputWritePercent + "]");

        return capacityPerTask;
    }

    private DescribeTableResult describeDynamoTable() {
        DescribeTableRequest request = new DescribeTableRequest().withTableName(this.tableName);
        return loadDynamoDB().describeTable(request);
    }

    private long getDynamoTableWriteCapacity() {
        DescribeTableResult result = describeDynamoTable();
        return result.getTable().getProvisionedThroughput().getWriteCapacityUnits();
    }

    private void drainQueue() {
        while (this.queue.size() > 0) {
            resetCurrentWriteCapacity();
            if (shouldDoBatchWrite(1)) {
                submitBatchWriteItemRequest();
            }
            // pause for a bit to let the queue unfill
            try {
                Thread.sleep(THROUGHPUT_WAIT_MS);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }
    }

    private boolean shouldDoBatchWrite(int minimumBatchSize) {
        // only send writes if we have a minimum batch size ready
        // and if our queue has enough to process it
        // a better way would be to know how much capacity is in the queue
        // and figure out what the expected capacity is for that batch size.
        if (this.queue.size() >= minimumBatchSize) {
            // figure out how big each item is on average
            long queueCapacity = this.queue.getQueueCapacity();
            double averageItemCapacity = (double) queueCapacity / this.queue.size();
            double capacityRequiredToSendMinimumBatchSize = averageItemCapacity * minimumBatchSize;

            // write if we either have the capacity to cover this write
            // or we would never have the capacity to cover a full write
            return ((capacityRequiredToSendMinimumBatchSize <= this.currentWriteCapacity)
                    || (capacityRequiredToSendMinimumBatchSize >= this.maxWriteCapacity));
        } else {
            return false;
        }
        // return (this.queue.size() >= minimumBatchSize) &&
        //        (this.currentWriteCapacity >= (0.5 * this.maxWriteCapacity));
    }

    private void resetCurrentWriteCapacity() {
        // check the elapsed time
        long elapsedMillis = this.stopwatch.stop().elapsedMillis();
        double elapsedSeconds = ((double) elapsedMillis) / 1000;

        // restart the timer
        this.stopwatch.reset().start();

        // we earn fullWriteCapacity every second
        double earnedCapacity = elapsedSeconds * this.maxWriteCapacity;

        // never set the current capacity higher than the full capacity
        this.currentWriteCapacity = Math.min(this.maxWriteCapacity, this.currentWriteCapacity + earnedCapacity);
    }

    private WriteRequestWithCapacity getWriteRequestWithCapacity(Tuple tuple) throws IOException {
        ResourceFieldSchema[] fields = this.schema.getFields();

        Map<String, AttributeValue> dynamoItem = new HashMap<String, AttributeValue>();
        int dataSize = 0;
        int dynamoItemSize = 0;
        int tupleSize = tuple.size();
        for (int i = 0; i < tupleSize; i++) {
            Object field = tuple.get(i);
            AttributeValue dynamoValue = null;

            switch (DataType.findType(field)) {
            case DataType.NULL:
                // dynamodb does not support null values
                // simply don't write field
                reportCounter(DYNAMO_COUNTER_NULL_FIELDS_DISCARDED, 1);
                break;
            case DataType.BOOLEAN:
                if (((Boolean) field).booleanValue()) {
                    dynamoValue = new AttributeValue().withN("1");
                } else {
                    dynamoValue = new AttributeValue().withN("0");
                }
                dataSize += 1;
                dynamoItemSize += 1;
                break;
            case DataType.INTEGER:
            case DataType.LONG:
            case DataType.FLOAT:
            case DataType.DOUBLE:
                String numAsString = field.toString();
                dynamoValue = new AttributeValue().withN(numAsString);
                dataSize += numAsString.length();
                dynamoItemSize += numAsString.length();
                break;
            case DataType.BYTEARRAY:
                byte[] b = ((DataByteArray) field).get();
                ByteBuffer buffer = ByteBuffer.allocate(b.length);
                buffer.put(b, 0, b.length);
                buffer.position(0);
                dynamoValue = new AttributeValue().withB(buffer);
                dataSize += b.length;
                dynamoItemSize += b.length;
                break;
            case DataType.CHARARRAY:
                String fieldStr = (String) field;
                int fieldLen = fieldStr.length();
                if (fieldLen > 0) {
                    dynamoValue = new AttributeValue().withS(fieldStr);
                    dataSize += fieldLen;
                    dynamoItemSize += fieldLen;
                } else {
                    // DynamoDB cannot handle empty strings
                    reportCounter(DYNAMO_COUNTER_EMPTY_STRING_FIELDS_DISCARDED, 1);
                }
                break;
            case DataType.BYTE:
                ByteBuffer buf = ByteBuffer.allocate(1);
                buf.put((Byte) field);
                buf.position(0);
                dynamoValue = new AttributeValue().withB(buf);
                dataSize += 1;
                dynamoItemSize += 1;
                break;
            case DataType.MAP:
            case DataType.TUPLE:
                Tuple listTuple = (Tuple) field;
                if (listTuple.size() > 0) {
                    Collection<String> listOfValues = new ArrayList<String>();
                    for (int k = 0; k < listTuple.size(); k++) {
                        String strItem = (String) listTuple.get(k);
                        int itemLen = strItem.length();
                        dataSize += itemLen;
                        dynamoItemSize += itemLen;
                        listOfValues.add(strItem);
                    }
                    dynamoValue = new AttributeValue().withSS(listOfValues);
                } else {
                    // DynamoDB cannot handle empty strings
                    reportCounter(DYNAMO_COUNTER_EMPTY_STRING_FIELDS_DISCARDED, 1);
                }
                break;
            case DataType.BAG:
                throw new RuntimeException("DynamoDBStorage does not support Maps, Tuples or Bags");
            }

            if (dynamoValue != null) {
                ResourceFieldSchema fieldSchema = fields[i];
                String fieldName = fieldSchema.getName();
                if (fieldName == null) {
                    throw new IllegalArgumentException(
                            "Cannot write a field with no name (element " + i + " ) FieldSchema: " + fields);
                }
                dynamoItemSize += fieldName.length();
                dynamoItem.put(fieldName, dynamoValue);
            }
        }

        // check for max item size
        if (dynamoItemSize > DYNAMO_MAX_ITEM_SIZE_IN_BYTES) {
            throw new RuntimeException("Item size " + dynamoItemSize + " bytes is larger than max dynamo item size "
                    + DYNAMO_MAX_ITEM_SIZE_IN_BYTES + ". Aborting. Item: " + dynamoItem);
        }

        WriteRequest writeRequest = new WriteRequest().withPutRequest(new PutRequest().withItem(dynamoItem));
        return new WriteRequestWithCapacity(writeRequest, dynamoItemSize, dataSize);
    }

    private void submitBatchWriteItemRequest() {
        long capacityConsumed = 0;

        List<WriteRequest> writeRequests = Lists.newArrayListWithCapacity(DYNAMO_MAX_ITEMS_IN_BATCH_WRITE_REQUEST);

        // fill up the queue (pass in the floor of current capacity to be conservative)
        long bytesToWrite = this.queue.drainTo(writeRequests, (long) this.currentWriteCapacity,
                (long) this.maxWriteCapacity);
        int numWriteRequests = writeRequests.size();

        // nothing to do
        if (numWriteRequests == 0) {
            return;
        }

        // send the data over
        Map<String, List<WriteRequest>> unprocessedItems = new HashMap<String, List<WriteRequest>>(1);
        unprocessedItems.put(this.tableName, writeRequests);

        for (int currentRetry = 0; currentRetry < this.maxNumRetriesPerBatchWrite; currentRetry += 1) {
            if (currentRetry > 0) {
                reportCounter(DYNAMO_COUNTER_RETRIES, 1);
            }

            BatchWriteItemRequest request = new BatchWriteItemRequest().withRequestItems(unprocessedItems);
            BatchWriteItemResult result = this.dynamo.batchWriteItem(request);
            unprocessedItems = result.getUnprocessedItems();

            // track capacity used
            capacityConsumed += getConsumedCapacity(result);

            if (unprocessedItems.isEmpty()) {
                reportCounter(DYNAMO_COUNTER_CONSUMED_CAPACITY, capacityConsumed);
                reportCounter(DYNAMO_COUNTER_BYTES_WRITTEN, bytesToWrite);

                // reduce capacity
                this.currentWriteCapacity -= capacityConsumed;

                // log.debug("Successfully sent " + numWriteRequests +
                //         " records to dynamo, using write capacity: " + capacityConsumed +
                //         ", new available capacity: " + this.currentWriteCapacity);

                // success
                break;
            } else {
                long retryMs = getRetryMs(currentRetry);
                log.info("Pausing " + retryMs + " ms before retrying write for "
                        + unprocessedItems.get(this.tableName).size() + " items to Dynamo. Retries so far: "
                        + currentRetry);
                try {
                    Thread.sleep(retryMs);
                } catch (InterruptedException e) {
                    throw new RuntimeException(e);
                }
            }
        }

        if (!unprocessedItems.isEmpty()) {
            throw new RuntimeException(
                    "Out of retries trying to add items to DynamoDB table. Unprocessed items: " + unprocessedItems);
        }

        // track bytes and records written
        reportCounter(DYNAMO_COUNTER_RECORDS_WRITTEN, numWriteRequests);
    }

    private void reportCounter(String counterName, long incrementValue) {
        PigStatusReporter reporter = PigStatusReporter.getInstance();
        if (reporter != null) {
            Counter counter = reporter.getCounter(DYNAMO_COUNTER_GROUP, counterName);
            if (counter != null) {
                counter.increment(incrementValue);
            }
        }
    }

    private long getConsumedCapacity(BatchWriteItemResult result) {
        double consumedCapacity = 0;
        for (BatchWriteResponse response : result.getResponses().values()) {
            consumedCapacity += response.getConsumedCapacityUnits();
        }
        return new Double(consumedCapacity).longValue();
    }

    private long getRetryMs(int retryNum) {
        // max retry wait
        double calculatedRetry = Math.pow(2, retryNum) * 50;
        return new Double(Math.min(calculatedRetry, this.maxRetryWaitMilliseconds)).longValue();
    }

    /** OUTPUT FORMAT **/

    class NoopRecordWriter extends RecordWriter<NullWritable, NullWritable> {
        @Override
        public void close(TaskAttemptContext arg0) throws IOException, InterruptedException {
            // IGNORE
        }

        @Override
        public void write(NullWritable arg0, NullWritable arg1) throws IOException, InterruptedException {
            // IGNORE
        }
    }

    class DynamoDBOutputFormat extends OutputFormat<NullWritable, NullWritable> {

        @Override
        public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
            // IGNORE
        }

        @Override
        public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
            return new OutputCommitter() {

                @Override
                public void abortTask(TaskAttemptContext context) throws IOException {
                    drainQueue();
                }

                @Override
                public void commitTask(TaskAttemptContext context) throws IOException {
                    drainQueue();
                }

                @Override
                public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
                    return true;
                }

                // @Override
                // public void cleanupJob(JobContext context) throws IOException {
                //     // IGNORE
                // }

                @Override
                public void setupJob(JobContext context) throws IOException {
                    // IGNORE
                }

                @Override
                public void setupTask(TaskAttemptContext context) throws IOException {
                    // IGNORE
                }
            };
        }

        @Override
        public RecordWriter<NullWritable, NullWritable> getRecordWriter(TaskAttemptContext arg0)
                throws IOException, InterruptedException {
            return new NoopRecordWriter();
        }
    }

    class WriteRequestWithCapacity {
        private WriteRequest writeRequest;
        private long dataSize;
        private long capacity;

        WriteRequestWithCapacity(WriteRequest writeRequest, long dynamoItemSize, long dataSize) {
            this.writeRequest = writeRequest;
            this.dataSize = dataSize;
            this.capacity = calculateCapacity(dynamoItemSize);
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + getOuterType().hashCode();
            result = prime * result + (int) (capacity ^ (capacity >>> 32));
            result = prime * result + ((writeRequest == null) ? 0 : writeRequest.hashCode());
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj)
                return true;
            if (obj == null)
                return false;
            if (getClass() != obj.getClass())
                return false;
            WriteRequestWithCapacity other = (WriteRequestWithCapacity) obj;
            if (!getOuterType().equals(other.getOuterType()))
                return false;
            if (capacity != other.capacity)
                return false;
            if (writeRequest == null) {
                if (other.writeRequest != null)
                    return false;
            } else if (!writeRequest.equals(other.writeRequest))
                return false;
            return true;
        }

        private DynamoDBStorage getOuterType() {
            return DynamoDBStorage.this;
        }

        @Override
        public String toString() {
            return "WriteRequestWithCapacity [writeRequest=" + writeRequest + ", capacity=" + capacity + "]";
        }

        public WriteRequest getWriteRequest() {
            return writeRequest;
        }

        public long getCapacity() {
            return capacity;
        }

        public long getDataSizeInBytes() {
            return dataSize;
        }

        /**
         * Calculate the write capacity that would be needed
         * to process a number of bytes.
         *
         * @param bytes
         * @return
         */
        private long calculateCapacity(long bytes) {
            // consumed capacity = ceiling ( kb );
            double capacity = Math.ceil(((double) bytes) / 1024);
            // round to nearest whole number to deal w/ floating point arithmetic imprecision
            return Math.round(capacity);
        }
    }

    class DynamoWriteRequestBlockingQueue {

        private Queue<WriteRequestWithCapacity> queue;
        private long queueCapacity;

        public DynamoWriteRequestBlockingQueue() {
            this.queue = new LinkedBlockingQueue<WriteRequestWithCapacity>(DYNAMO_MAX_ITEMS_IN_BATCH_WRITE_REQUEST);
            this.queueCapacity = 0;
        }

        /**
         * Inserts the specified element at the tail of this queue if it
         * is possible to do so immediately without exceeding the queue's capacity,
         * returning true upon success and false if this queue is full.
         *
         * @param request
         * @return
         */
        public boolean offer(WriteRequestWithCapacity request) {
            // first: check whether adding this would go over the max bytes per batch request
            if ((this.queueCapacity + request.getCapacity()) > DYNAMO_MAX_CAPACITY_IN_WRITE_REQUEST) {
                log.debug("Blocking message with capacity " + request.getCapacity()
                        + " b/c we already have " + this.queueCapacity + " in queue, limit: "
                        + DYNAMO_MAX_CAPACITY_IN_WRITE_REQUEST);
                return false;
            }

            // next: check whether we have enough room for it as an item
            boolean offerResponse = this.queue.offer(request);
            if (offerResponse) {
                this.queueCapacity += request.getCapacity();
            }
            return offerResponse;
        }

        /**
         *
         * @return number of bytes drained
         */
        public long drainTo(Collection<WriteRequest> c, long currentCapacity, long maxCapacity) {
            long drainedCapacity = 0;
            long bytesDrained = 0;
            while (true) {
                WriteRequestWithCapacity peek = this.queue.peek();

                // no more elements, we're done
                if (peek == null) {
                    return bytesDrained;
                }

                // we will return the element if:
                // - adding it would not exceed our currentCapacity
                // - currentCapacity == maxCapacity and the element is bigger than maxCapacity (special case for huge items)
                if ((peek.getCapacity() + drainedCapacity <= currentCapacity)
                        || ((peek.getCapacity() > maxCapacity) && (currentCapacity == maxCapacity))) {
                    WriteRequestWithCapacity removed = this.queue.remove();
                    c.add(removed.getWriteRequest());
                    drainedCapacity += removed.getCapacity();
                    this.queueCapacity -= removed.getCapacity();
                    bytesDrained += removed.getDataSizeInBytes();
                } else {
                    // item is too big, we're done
                    return bytesDrained;
                }
            }
        }

        public long getQueueCapacity() {
            return this.queueCapacity;
        }

        public int size() {
            return this.queue.size();
        }

        @Override
        public String toString() {
"DynamoWriteRequestBlockingQueue [queue=" + queue + ", queueCapacity=" + queueCapacity + "]"; } } class HadoopJobInfo { private Job job; HadoopJobInfo(Job job) { this.job = job; } public String getMapOrReduce() { return isStoreInMapSide() ? "map" : "reduce"; } public int getNumTasksForStore() { return isStoreInMapSide() ? getNumMapTasks() : getNumReduceTasks(); } public int getNumSlotsForStore() throws IOException { ClusterStatus clusterStatus = getClusterStatus(); // get the number of slots allocated to the store return isStoreInMapSide() ? clusterStatus.getMaxMapTasks() : clusterStatus.getMaxReduceTasks(); } public Configuration getJobConfiguration() { return this.job.getConfiguration(); } /** * Is our Dynamo store in the Map side (versus the Reduce side) of the Job? * @return */ boolean isStoreInMapSide() { return getNumReduceTasks() == 0; } int getNumReduceTasks() { return this.job.getNumReduceTasks(); } int getNumMapTasks() { return this.job.getConfiguration().getInt("mapred.map.tasks", 1); } ClusterStatus getClusterStatus() throws IOException { JobConf jobConf = new JobConf(this.job.getConfiguration()); JobClient jobClient = new JobClient(jobConf); return jobClient.getClusterStatus(false); } } }