Java tutorial

This tutorial walks through CloudWatchAlertDispatcher from Netflix's s3mper project: an AlertDispatcher implementation that publishes CloudWatch metrics and SQS messages when an S3 listing is found to be inconsistent (or recovers after retrying) and when a metastore operation times out. The full source follows.
/*
 *
 * Copyright 2013 Netflix, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package com.netflix.bdp.s3mper.alert.impl;

import com.netflix.bdp.s3mper.alert.AlertDispatcher;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.cloudwatch.AmazonCloudWatchAsyncClient;
import com.amazonaws.services.cloudwatch.model.MetricDatum;
import com.amazonaws.services.cloudwatch.model.PutMetricDataRequest;
import com.amazonaws.services.cloudwatch.model.StandardUnit;
import com.amazonaws.services.sqs.AmazonSQSClient;
import com.amazonaws.services.sqs.model.GetQueueUrlRequest;
import com.amazonaws.services.sqs.model.SendMessageRequest;
import com.netflix.bdp.s3mper.alert.impl.AbstractMessage.QueryType;

import java.io.IOException;
import java.net.InetAddress;
import java.net.URI;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.log4j.Logger;
import org.codehaus.jackson.map.ObjectMapper;

/**
 * Dispatches CloudWatch Metrics and SQS Messages on the event of a consistency
 * failure or timeout.
 *
 * @author dweeks
 */
public class CloudWatchAlertDispatcher implements AlertDispatcher {
    private static final Logger log = Logger.getLogger(CloudWatchAlertDispatcher.class.getName());

    private AmazonCloudWatchAsyncClient cloudWatch;
    private AmazonSQSClient sqs;

    private String namespace = "netflix";
    private String cloudWatchConsistencyMetric = "com.netflix.bdp.s3mper.consistency.failures";
    private String cloudWatchTimeoutMetric = "com.netflix.bdp.s3mper.consistency.timeout";

    private String consistencyQueue = "s3mper-alert-queue";
    private String timeoutQueue = "s3mper-timeout-queue";
    private String notificationQueue = "s3mper-notification-queue";

    private String consistencyQueueUrl = "";
    private String timeoutQueueUrl = "";
    private String notificationQueueUrl = "";

    private boolean reportingDisabled = false;

    private URI uri;
    private Configuration conf;

    private int pathReportLimit = 10;
    private int traceDepth = 15;

    @Override
    public void init(URI uri, Configuration conf) {
        this.uri = uri;
        this.conf = conf;
    }

    /**
     * Don't initialize the SQS queues unless we actually need to send a message.
     */
    private void lazyInit() {
        String keyId = conf.get("fs." + uri.getScheme() + ".awsAccessKeyId");
        String keySecret = conf.get("fs." + uri.getScheme() + ".awsSecretAccessKey");

        //An override option for accessing across accounts
        keyId = conf.get("fs." + uri.getScheme() + ".override.awsAccessKeyId", keyId);
        keySecret = conf.get("fs." + uri.getScheme() + ".override.awsSecretAccessKey", keySecret);

        synchronized (this) {
            if (cloudWatch == null) {
                initCloudWatch(keyId, keySecret);
            }

            if (sqs == null) {
                initSqs(keyId, keySecret);
            }
        }
    }

    private void initCloudWatch(String keyId, String keySecret) {
        log.debug("Initializing CloudWatch Client");
        cloudWatch = new AmazonCloudWatchAsyncClient(new BasicAWSCredentials(keyId, keySecret));
    }

    private void initSqs(String keyId, String keySecret) {
        log.debug("Initializing SQS Client");
        sqs = new AmazonSQSClient(new BasicAWSCredentials(keyId, keySecret));

        //SQS Consistency Queue
        consistencyQueue = conf.get("s3mper.alert.sqs.queue", consistencyQueue);
        consistencyQueueUrl = sqs.getQueueUrl(new GetQueueUrlRequest(consistencyQueue)).getQueueUrl();

        //SQS Timeout Queue
        timeoutQueue = conf.get("s3mper.timeout.sqs.queue", timeoutQueue);
        timeoutQueueUrl = sqs.getQueueUrl(new GetQueueUrlRequest(timeoutQueue)).getQueueUrl();

        //SQS Notification Queue
        notificationQueue = conf.get("s3mper.notification.sqs.queue", notificationQueue);
        notificationQueueUrl = sqs.getQueueUrl(new GetQueueUrlRequest(notificationQueue)).getQueueUrl();

        //Disable reporting (Testing purposes mostly)
        reportingDisabled = conf.getBoolean("s3mper.reporting.disabled", reportingDisabled);
    }

    /**
     * Sends an alert detailing that the given paths were missing from a list
     * operation.
     *
     * @param missingPaths
     */
    @Override
    public void alert(List<Path> missingPaths) {
        lazyInit();

        if (reportingDisabled) {
            return;
        }

        sendCloudWatchConsistencyAlert();
        sendSQSConsistencyMessage(missingPaths, false);
    }

    private void sendCloudWatchConsistencyAlert() {
        MetricDatum datum = new MetricDatum();
        datum.setMetricName(cloudWatchConsistencyMetric);
        datum.setUnit(StandardUnit.Count);
        datum.setValue(1.0);

        PutMetricDataRequest request = new PutMetricDataRequest();
        request.setNamespace(namespace);
        request.setMetricData(Collections.singleton(datum));

        cloudWatch.putMetricData(request);
    }

    /**
     * Sends a message that a listing was initially inconsistent but was
     * recovered by delaying/retrying.
     *
     * @param paths
     */
    @Override
    public void recovered(List<Path> paths) {
        lazyInit();

        if (reportingDisabled) {
            return;
        }

        sendSQSConsistencyMessage(paths, true);
    }

    private void sendSQSConsistencyMessage(List<Path> paths, boolean recovered) {
        S3ConsistencyMessage message = new S3ConsistencyMessage();

        buildMessage(message);

        List<String> pathStrings = new ArrayList<String>();
        boolean truncated = false;

        for (Path p : paths) {
            pathStrings.add(p.toUri().toString());

            //Truncate if the message payload gets to be too large (i.e. too many missing files)
            if (pathStrings.size() >= pathReportLimit) {
                truncated = true;
                break;
            }
        }

        message.setPaths(pathStrings);
        message.setTruncated(truncated);

        int missingFiles = paths.size();

        if (recovered) {
            missingFiles = 0;
        }

        message.setMissingFiles(missingFiles);
        message.setRecovered(recovered);

        sendMessage(consistencyQueueUrl, message);

        if (!recovered) {
            sendMessage(notificationQueueUrl, message);
        }
    }

    /**
     * Sends a message to the timeout queue indicating that a DynamoDB operation
     * timed out.
     *
     * @param operation
     * @param paths
     */
    @Override
    public void timeout(String operation, List<Path> paths) {
        lazyInit();

        if (reportingDisabled) {
            return;
        }

        //TODO: Being over-protective about these timeout messages.
        try {
            sendCloudWatchTimeoutAlert();
        } catch (Exception e) {
            log.error("Failed to send cloud watch timeout alert.", e);
        }

        try {
            sendSQSTimeoutMessage(operation);
        } catch (Exception e) {
            log.error("Failed to send SQS timeout message.", e);
        }
    }

    private void sendCloudWatchTimeoutAlert() {
        MetricDatum datum = new MetricDatum();
        datum.setMetricName(cloudWatchTimeoutMetric);
        datum.setUnit(StandardUnit.Count);
        datum.setValue(1.0);

        PutMetricDataRequest request = new PutMetricDataRequest();
        request.setNamespace(namespace);
        request.setMetricData(Collections.singleton(datum));

        cloudWatch.putMetricData(request);
    }

    private void sendSQSTimeoutMessage(String operation) {
        S3mperTimeoutMessage message = new S3mperTimeoutMessage();

        buildMessage(message);

        message.setOperation(operation);

        sendMessage(timeoutQueueUrl, message);
    }

    private void buildMessage(AbstractMessage message) {
        String hostname = "unknown";

        try {
            hostname = InetAddress.getLocalHost().getHostName();
        } catch (UnknownHostException e) {
            log.warn("Failed to identify hostname", e);
        }

        message.setEpoch(System.currentTimeMillis());
        message.setTimestamp(new Date(message.getEpoch()).toString());
        message.setHostname(hostname);

        String username = conf.get("user.name", System.getProperty("user.name"));

        try {
            username = UserGroupInformation.getCurrentUser().getUserName();
        } catch (IOException e) {
            log.warn("Failed to identify user using hadoop library.", e);
        }

        message.setUsername(username);
        message.setGenieId(conf.get("genie.job.id"));
        message.setDataovenId(conf.get("dataoven.job.id"));

        //Infer the query engine from whichever query id is present in the config
        String queryId = conf.get("hive.query.id");
        QueryType queryType = QueryType.Unknown;

        if (queryId != null) {
            queryType = QueryType.Hive;
            message.setLogFile(conf.get("hive.log.file"));
        } else {
            queryId = conf.get("pig.script.id");

            if (queryId != null) {
                queryType = QueryType.Pig;
                message.setLogFile(conf.get("pig.logfile"));
            }
        }

        message.setQueryId(queryId);
        message.setQueryType(queryType);

        message.setJobId(conf.get("mapred.job.id"));
        message.setTaskId(conf.get("mapred.tip.id"));
        message.setAttemptId(conf.get("mapred.task.id"));
        message.setInputFile(conf.get("mapred.input.file"));
        message.setEmail(conf.get("s3mper.email"));

        try {
            //We have to guess at this since it may not be explicitly in the config
            if (message.getJobId() == null) {
                String[] split = conf.get("mapreduce.job.dir").split("/");
                String jobId = split[split.length - 1];
                message.setJobId(jobId);
            }
        } catch (RuntimeException e) {
            log.debug("Failed to determine job id");
        }

        try {
            StackTraceElement[] stack = Thread.currentThread().getStackTrace();

            List<String> stackTrace = new ArrayList<String>(traceDepth);

            for (int i = 0; i < traceDepth && i < stack.length; i++) {
                stackTrace.add(stack[i].toString());
            }

            message.setStackTrace(stackTrace);
        } catch (Exception e) {
            log.debug("Stacktrace generation failed", e);
        }
    }

    private void sendMessage(String url, AbstractMessage message) {
        SendMessageRequest sqsRequest = null;

        try {
            String payload = new ObjectMapper().writeValueAsString(message);

            if (log.isDebugEnabled()) {
                log.debug("Sending SQS: " + payload);
            }

            sqsRequest = new SendMessageRequest(url, payload);
        } catch (IOException e) {
            log.error("Failed to map json object.", e);
            //Bail out rather than pass a null request to the SQS client
            return;
        }

        sqs.sendMessage(sqsRequest);
    }

    @Override
    public void setConfig(Configuration conf) {
        this.conf = conf;
    }

    public void setUri(URI uri) {
        this.uri = uri;
    }
}
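For context, here is a minimal sketch of how a caller might wire up and exercise the dispatcher. The s3n scheme, bucket name, credential values, and the "list" operation name are assumptions for illustration only; the configuration keys (fs.<scheme>.awsAccessKeyId, s3mper.alert.sqs.queue, and so on) are the ones read by lazyInit() and initSqs() above. Note that the SQS queues must already exist, since initSqs resolves their URLs up front, and the CloudWatch/SQS clients are only created on the first alert, recovered, or timeout call.

import java.net.URI;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import com.netflix.bdp.s3mper.alert.impl.CloudWatchAlertDispatcher;

public class DispatcherExample {
    public static void main(String[] args) throws Exception {
        //Hypothetical values; these keys match what lazyInit()/initSqs() read
        Configuration conf = new Configuration();
        conf.set("fs.s3n.awsAccessKeyId", "AKIA...");        //assumed credentials
        conf.set("fs.s3n.awsSecretAccessKey", "...");        //assumed credentials
        conf.set("s3mper.alert.sqs.queue", "s3mper-alert-queue");

        CloudWatchAlertDispatcher dispatcher = new CloudWatchAlertDispatcher();
        dispatcher.init(new URI("s3n://example-bucket/warehouse"), conf);

        //Report paths missing from a listing: publishes a CloudWatch metric and
        //an SQS message (AWS clients are initialized lazily on this first call)
        List<Path> missing = Arrays.asList(
                new Path("s3n://example-bucket/warehouse/part-00000"));
        dispatcher.alert(missing);

        //Report a metastore operation that timed out ("list" is an assumed name)
        dispatcher.timeout("list", missing);
    }
}

Because lazyInit() synchronizes on the dispatcher before checking the client fields, repeated calls from multiple threads initialize the CloudWatch and SQS clients at most once each.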