com.netflix.bdp.s3mper.alert.impl.CloudWatchAlertDispatcher.java Source code

Java tutorial

Introduction

Here is the source code for com.netflix.bdp.s3mper.alert.impl.CloudWatchAlertDispatcher.java

Source

/*
 *
 *  Copyright 2013 Netflix, Inc.
 *
 *     Licensed under the Apache License, Version 2.0 (the "License");
 *     you may not use this file except in compliance with the License.
 *     You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 *     Unless required by applicable law or agreed to in writing, software
 *     distributed under the License is distributed on an "AS IS" BASIS,
 *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *     See the License for the specific language governing permissions and
 *     limitations under the License.
 *
 */

package com.netflix.bdp.s3mper.alert.impl;

import com.netflix.bdp.s3mper.alert.AlertDispatcher;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.cloudwatch.AmazonCloudWatchAsyncClient;
import com.amazonaws.services.cloudwatch.model.MetricDatum;
import com.amazonaws.services.cloudwatch.model.PutMetricDataRequest;
import com.amazonaws.services.cloudwatch.model.StandardUnit;
import com.amazonaws.services.sqs.AmazonSQSClient;
import com.amazonaws.services.sqs.model.GetQueueUrlRequest;
import com.amazonaws.services.sqs.model.SendMessageRequest;
import com.netflix.bdp.s3mper.alert.impl.AbstractMessage.QueryType;
import java.io.IOException;
import java.net.InetAddress;
import java.net.URI;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.log4j.Logger;
import org.codehaus.jackson.map.ObjectMapper;

/**
 * Dispatches CloudWatch Metrics and SQS Messages on the event of consistency
 * failure or timeout.
 *
 * <p>The AWS clients are created on first use rather than in
 * {@link #init(URI, Configuration)}. Credentials are read from the Hadoop
 * configuration keys <code>fs.&lt;scheme&gt;.awsAccessKeyId</code> and
 * <code>fs.&lt;scheme&gt;.awsSecretAccessKey</code>, each of which may be
 * overridden by the matching <code>fs.&lt;scheme&gt;.override.*</code> key.
 *
 * <p>Setting <code>s3mper.reporting.disabled</code> to <code>true</code> turns
 * every dispatch method into a no-op and skips creation of the AWS clients.
 *
 * @author dweeks
 */
public class CloudWatchAlertDispatcher implements AlertDispatcher {
    private static final Logger log = Logger.getLogger(CloudWatchAlertDispatcher.class.getName());

    private AmazonCloudWatchAsyncClient cloudWatch;
    private AmazonSQSClient sqs;

    private String namespace = "netflix";
    private String cloudWatchConsistencyMetric = "com.netflix.bdp.s3mper.consistency.failures";
    private String cloudWatchTimeoutMetric = "com.netflix.bdp.s3mper.consistency.timeout";

    // Queue names; each default can be overridden through the configuration (see initSqs).
    private String consistencyQueue = "s3mper-alert-queue";
    private String timeoutQueue = "s3mper-timeout-queue";
    private String notificationQueue = "s3mper-notification-queue";

    // Queue URLs resolved from the names above when the SQS client is initialized.
    private String consistencyQueueUrl = "";
    private String timeoutQueueUrl = "";
    private String notificationQueueUrl = "";

    private boolean reportingDisabled = false;

    private URI uri;
    private Configuration conf;

    // Maximum number of paths included in a single consistency message.
    private int pathReportLimit = 10;
    // Maximum number of stack frames attached to a message.
    private int traceDepth = 15;

    /**
     * Stores the filesystem URI and configuration for later use. No AWS
     * client is created here; see {@link #lazyInit()}.
     *
     * @param uri filesystem URI whose scheme selects the credential keys
     * @param conf configuration to read credentials and settings from
     */
    @Override
    public void init(URI uri, Configuration conf) {
        this.uri = uri;
        this.conf = conf;
    }

    /**
     * Don't initialize the CloudWatch/SQS clients unless we actually need to
     * send a message.
     *
     * <p>The <code>s3mper.reporting.disabled</code> flag is evaluated first so
     * that a disabled dispatcher never needs credentials or network access.
     * Client creation happens while holding this instance's monitor, so
     * concurrent callers do not create the clients twice.
     */
    private void lazyInit() {
        synchronized (this) {
            //Disable reporting  (Testing purposes mostly)
            reportingDisabled = conf.getBoolean("s3mper.reporting.disabled", reportingDisabled);

            if (reportingDisabled) {
                return;
            }

            if (cloudWatch != null && sqs != null) {
                return;
            }

            String prefix = "fs." + uri.getScheme();

            String keyId = conf.get(prefix + ".awsAccessKeyId");
            String keySecret = conf.get(prefix + ".awsSecretAccessKey");

            //An override option for accessing across accounts
            keyId = conf.get(prefix + ".override.awsAccessKeyId", keyId);
            keySecret = conf.get(prefix + ".override.awsSecretAccessKey", keySecret);

            if (cloudWatch == null) {
                initCloudWatch(keyId, keySecret);
            }

            if (sqs == null) {
                initSqs(keyId, keySecret);
            }
        }
    }

    /**
     * Creates the CloudWatch client.
     *
     * @param keyId AWS access key id
     * @param keySecret AWS secret access key
     */
    private void initCloudWatch(String keyId, String keySecret) {
        log.debug("Initializing CloudWatch Client");
        cloudWatch = new AmazonCloudWatchAsyncClient(new BasicAWSCredentials(keyId, keySecret));
    }

    /**
     * Creates the SQS client and resolves the URLs of the consistency,
     * timeout and notification queues.
     *
     * <p>Everything is resolved into locals first and only published to the
     * fields once all three lookups have succeeded. If a lookup throws, the
     * {@code sqs} field stays {@code null} and the next {@link #lazyInit()}
     * call retries instead of sending to an empty queue URL.
     *
     * @param keyId AWS access key id
     * @param keySecret AWS secret access key
     */
    private void initSqs(String keyId, String keySecret) {
        log.debug("Initializing SQS Client");
        AmazonSQSClient client = new AmazonSQSClient(new BasicAWSCredentials(keyId, keySecret));

        //SQS Consistency Queue
        String consistencyName = conf.get("s3mper.alert.sqs.queue", consistencyQueue);
        String consistencyUrl = client.getQueueUrl(new GetQueueUrlRequest(consistencyName)).getQueueUrl();

        //SQS Timeout Queue
        String timeoutName = conf.get("s3mper.timeout.sqs.queue", timeoutQueue);
        String timeoutUrl = client.getQueueUrl(new GetQueueUrlRequest(timeoutName)).getQueueUrl();

        //SQS Notification Queue
        String notificationName = conf.get("s3mper.notification.sqs.queue", notificationQueue);
        String notificationUrl = client.getQueueUrl(new GetQueueUrlRequest(notificationName)).getQueueUrl();

        consistencyQueue = consistencyName;
        consistencyQueueUrl = consistencyUrl;
        timeoutQueue = timeoutName;
        timeoutQueueUrl = timeoutUrl;
        notificationQueue = notificationName;
        notificationQueueUrl = notificationUrl;

        //Assigned last: a non-null client marks SQS initialization as complete
        sqs = client;
    }

    /**
     * Sends an alert detailing that the given paths were missing from a list
     * operation. Emits one CloudWatch count metric and an SQS message to both
     * the consistency and the notification queue.
     *
     * @param missingPaths paths that were missing from the listing
     */
    @Override
    public void alert(List<Path> missingPaths) {
        lazyInit();

        if (reportingDisabled) {
            return;
        }

        sendCloudWatchConsistencyAlert();
        sendSQSConsistencyMessage(missingPaths, false);
    }

    /**
     * Records one consistency failure in CloudWatch.
     */
    private void sendCloudWatchConsistencyAlert() {
        putCountMetric(cloudWatchConsistencyMetric);
    }

    /**
     * Sends a message that a listing was initially inconsistent but was
     * recovered by delaying/retrying.
     *
     * @param paths paths that were initially missing
     */
    @Override
    public void recovered(List<Path> paths) {
        lazyInit();

        if (reportingDisabled) {
            return;
        }

        sendSQSConsistencyMessage(paths, true);
    }

    /**
     * Builds and sends the consistency message. The message always goes to
     * the consistency queue; it additionally goes to the notification queue
     * when the inconsistency was not recovered.
     *
     * @param paths affected paths; at most {@code pathReportLimit} are included
     * @param recovered whether the listing became consistent after retrying
     */
    private void sendSQSConsistencyMessage(List<Path> paths, boolean recovered) {
        S3ConsistencyMessage message = new S3ConsistencyMessage();

        buildMessage(message);

        List<String> pathStrings = new ArrayList<String>();

        for (Path p : paths) {
            //Truncate if the message payload gets to be too large (i.e. too many missing files)
            if (pathStrings.size() >= pathReportLimit) {
                break;
            }

            pathStrings.add(p.toUri().toString());
        }

        //Only flag truncation when paths were actually left out of the message
        boolean truncated = pathStrings.size() < paths.size();

        message.setPaths(pathStrings);
        message.setTruncated(truncated);

        int missingFiles = paths.size();

        if (recovered) {
            missingFiles = 0;
        }

        message.setMissingFiles(missingFiles);
        message.setRecovered(recovered);

        sendMessage(consistencyQueueUrl, message);

        if (!recovered) {
            sendMessage(notificationQueueUrl, message);
        }
    }

    /**
     * Sends a message to the timeout queue indicating that a dynamodb operation
     * timed out, and records the timeout in CloudWatch. Failures of either
     * send are logged and not propagated to the caller.
     *
     * @param operation name of the operation that timed out
     * @param paths paths involved in the operation (currently not included in
     *              the message)
     */
    @Override
    public void timeout(String operation, List<Path> paths) {
        lazyInit();

        if (reportingDisabled) {
            return;
        }

        //TODO: Being over-protective about these timeout messages.
        try {
            sendCloudWatchTimeoutAlert();
        } catch (Exception e) {
            log.error("Failed to send cloud watch timeout alert.", e);
        }

        try {
            sendSQSTimeoutMessage(operation);
        } catch (Exception e) {
            log.error("Failed to send SQS timeout message.", e);
        }
    }

    /**
     * Records one timeout in CloudWatch.
     */
    private void sendCloudWatchTimeoutAlert() {
        putCountMetric(cloudWatchTimeoutMetric);
    }

    /**
     * Publishes a single data point of value 1 with unit {@code Count} for
     * the given metric in this dispatcher's namespace.
     *
     * @param metricName name of the CloudWatch metric
     */
    private void putCountMetric(String metricName) {
        MetricDatum datum = new MetricDatum();
        datum.setMetricName(metricName);
        datum.setUnit(StandardUnit.Count);
        datum.setValue(1.0);

        PutMetricDataRequest request = new PutMetricDataRequest();
        request.setNamespace(namespace);
        request.setMetricData(Collections.singleton(datum));

        cloudWatch.putMetricData(request);
    }

    /**
     * Builds and sends the timeout message for the given operation.
     *
     * @param operation name of the operation that timed out
     */
    private void sendSQSTimeoutMessage(String operation) {
        S3mperTimeoutMessage message = new S3mperTimeoutMessage();
        buildMessage(message);

        message.setOperation(operation);

        sendMessage(timeoutQueueUrl, message);
    }

    /**
     * Fills in the fields shared by all messages: time, host, user, job and
     * query identifiers read from the configuration, and a stack trace of the
     * current thread limited to {@code traceDepth} frames.
     *
     * @param message message to populate
     */
    private void buildMessage(AbstractMessage message) {
        String hostname = "unknown";

        try {
            hostname = InetAddress.getLocalHost().getHostName();
        } catch (UnknownHostException e) {
            log.warn("Failed to identify hostname", e);
        }

        message.setEpoch(System.currentTimeMillis());
        message.setTimestamp(new Date(message.getEpoch()).toString());
        message.setHostname(hostname);

        //Configured/system user is the fallback if the hadoop lookup fails
        String username = conf.get("user.name", System.getProperty("user.name"));

        try {
            username = UserGroupInformation.getCurrentUser().getUserName();
        } catch (IOException e) {
            log.warn("Failed to identify user using hadoop library.", e);
        }

        message.setUsername(username);

        message.setGenieId(conf.get("genie.job.id"));
        message.setDataovenId(conf.get("dataoven.job.id"));
        String queryId = conf.get("hive.query.id");

        QueryType queryType = QueryType.Unknown;

        if (queryId != null) {
            queryType = QueryType.Hive;
            message.setLogFile(conf.get("hive.log.file"));
        } else {
            queryId = conf.get("pig.script.id");

            if (queryId != null) {
                queryType = QueryType.Pig;
                message.setLogFile(conf.get("pig.logfile"));
            }
        }

        message.setQueryId(queryId);
        message.setQueryType(queryType);

        message.setJobId(conf.get("mapred.job.id"));
        message.setTaskId(conf.get("mapred.tip.id"));
        message.setAttemptId(conf.get("mapred.task.id"));
        message.setInputFile(conf.get("mapred.input.file"));
        message.setEmail(conf.get("s3mper.email"));

        //We have to guess at this since it may not be explicitly in the config
        if (message.getJobId() == null) {
            String jobDir = conf.get("mapreduce.job.dir");
            String[] split = (jobDir == null) ? new String[0] : jobDir.split("/");

            if (split.length > 0) {
                //Use the last path component of the job directory as the job id
                message.setJobId(split[split.length - 1]);
            } else {
                log.debug("Failed to determine job id");
            }
        }

        try {
            StackTraceElement[] stack = Thread.currentThread().getStackTrace();

            List<String> stackTrace = new ArrayList<String>(traceDepth);

            for (int i = 0; i < traceDepth && i < stack.length; i++) {
                stackTrace.add(stack[i].toString());
            }

            message.setStackTrace(stackTrace);
        } catch (Exception e) {
            log.debug("Stacktrace generation failed", e);
        }

    }

    /**
     * Serializes the message to JSON and sends it to the given SQS queue.
     * If serialization fails the error is logged and nothing is sent.
     *
     * @param url URL of the target queue
     * @param message message to serialize and send
     */
    private void sendMessage(String url, AbstractMessage message) {
        String payload;

        try {
            payload = new ObjectMapper().writeValueAsString(message);
        } catch (IOException e) {
            log.error("Failed to map json object.", e);
            //There is no payload to send; do not hand a null request to SQS
            return;
        }

        if (log.isDebugEnabled()) {
            log.debug("Sending SQS: " + payload);
        }

        sqs.sendMessage(new SendMessageRequest(url, payload));
    }

    /**
     * Replaces the configuration used for credentials, queue names and
     * message metadata.
     *
     * @param conf new configuration
     */
    @Override
    public void setConfig(Configuration conf) {
        this.conf = conf;
    }

    /**
     * Replaces the filesystem URI whose scheme selects the credential keys.
     *
     * @param uri new filesystem URI
     */
    public void setUri(URI uri) {
        this.uri = uri;
    }

}