com.addthis.hydra.job.alert.JobAlertRunner.java Source code

Java tutorial

Introduction

Here is the source code for com.addthis.hydra.job.alert.JobAlertRunner.java

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.addthis.hydra.job.alert;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import java.io.IOException;

import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;

import java.text.DecimalFormat;
import java.text.SimpleDateFormat;

import com.addthis.basis.net.HttpUtil;
import com.addthis.basis.net.http.HttpResponse;

import com.addthis.codec.jackson.Jackson;
import com.addthis.codec.json.CodecJSON;
import com.addthis.hydra.job.Job;
import com.addthis.hydra.job.JobState;
import com.addthis.hydra.job.JobTask;
import com.addthis.hydra.job.JobTaskState;
import com.addthis.hydra.job.spawn.Spawn;
import com.addthis.hydra.job.spawn.SpawnMesh;
import com.addthis.hydra.job.store.SpawnDataStore;
import com.addthis.hydra.util.EmailUtil;
import com.addthis.maljson.JSONArray;
import com.addthis.maljson.JSONObject;
import com.addthis.meshy.MeshyClient;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.MapDifference;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimaps;
import com.google.common.collect.SetMultimap;

import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.typesafe.config.ConfigFactory;

import org.apache.commons.lang3.StringUtils;
import org.apache.http.entity.ContentType;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static com.addthis.hydra.job.store.SpawnDataStoreKeys.SPAWN_COMMON_ALERT_PATH;

/**
 * Runs over the set of job alerts, sending trigger / clear / error-changed notifications
 * (by email and/or webhook) as appropriate.
 */
public class JobAlertRunner {

    private static final Logger log = LoggerFactory.getLogger(JobAlertRunner.class);

    /** Value of the {@code Spawn.httpHost} setting; used as the host in alert links and in alert payloads. */
    private static final String clusterHead = ConfigFactory.load()
            .getString("com.addthis.hydra.job.spawn.Spawn.httpHost");

    /** Serializes webhook request bodies to JSON. */
    private static final ObjectMapper objectMapper = new ObjectMapper();

    private static final String meshHost = SpawnMesh.getMeshHost();
    private static final int meshPort = SpawnMesh.getMeshPort();

    /** 1024^3, written as exact long arithmetic rather than a cast floating-point power. */
    private static final long GIGA_BYTE = 1024L * 1024L * 1024L;

    // NOTE(review): SimpleDateFormat and DecimalFormat are not thread-safe. These shared instances are
    // only safe while the format(...) helpers are never invoked concurrently -- confirm against callers.
    private static final SimpleDateFormat dateFormat = new SimpleDateFormat("yyMMdd-HHmm");
    private static final DecimalFormat decimalFormat = new DecimalFormat("#.###");

    private final Spawn spawn;
    private final SpawnDataStore spawnDataStore;

    /** All known alerts, keyed by alert id. */
    private final ConcurrentHashMap<String, AbstractJobAlert> alertMap;

    /**
     * A mapping from (jobIds + aliases) to a set of alertIds.
     * Does not dereference aliases into their corresponding jobIds.
     */
    private final SetMultimap<String, String> jobToAlertsMap = Multimaps
            .synchronizedSetMultimap(HashMultimap.create());

    /** Mesh client handed to alerts when they are checked; null if it could not be created. */
    private MeshyClient meshyClient;

    /**
     * Whether alert scanning is enabled. Volatile, like {@link #lastAlertScanFailed}, so that a change
     * made through enableAlerts()/disableAlerts() is visible to a scan running on another thread.
     */
    private volatile boolean alertsEnabled;

    /** Set by scanAlerts(): true if the most recent scan ended with an unexpected exception. */
    private volatile boolean lastAlertScanFailed;

    public JobAlertRunner(Spawn spawn) {
        this.spawn = spawn;
        this.spawnDataStore = spawn.getSpawnDataStore();
        try {
            meshyClient = new MeshyClient(meshHost, meshPort);
        } catch (IOException e) {
            log.warn("Warning: failed to instantiate job alert mesh client", e);
            meshyClient = null;
        }
        String alertsEnabledString = null;
        try {
            alertsEnabledString = spawnDataStore.get(SPAWN_COMMON_ALERT_PATH);
        } catch (Exception e) {
            log.warn("Unable to read alerts status due to: {}", e.getMessage());
        }
        this.alertsEnabled = (alertsEnabledString == null) || alertsEnabledString.isEmpty()
                || "true".equals(alertsEnabledString);
        this.alertMap = new ConcurrentHashMap<>();
        loadAlertMap();
    }

    /**
     * Disables alert scanning. The flag is persisted first, so the in-memory state only changes
     * if the write to the data store succeeded.
     *
     * @throws Exception if the data store rejects the write
     */
    public void disableAlerts() throws Exception {
        final boolean enabled = false;
        spawnDataStore.put(SPAWN_COMMON_ALERT_PATH, Boolean.toString(enabled));
        alertsEnabled = enabled;
    }

    /**
     * Enables alert scanning. The flag is persisted first, so the in-memory state only changes
     * if the write to the data store succeeded.
     *
     * @throws Exception if the data store rejects the write
     */
    public void enableAlerts() throws Exception {
        final boolean enabled = true;
        spawnDataStore.put(SPAWN_COMMON_ALERT_PATH, Boolean.toString(enabled));
        alertsEnabled = enabled;
    }

    /**
     * @return true if alert scanning is currently enabled; when false, {@link #scanAlerts()} does nothing
     */
    public boolean isAlertsEnabled() {
        return alertsEnabled;
    }

    /**
     * @return true if the most recent {@link #scanAlerts()} run ended with an unexpected exception
     */
    public boolean isLastAlertScanFailed() {
        return lastAlertScanFailed;
    }

    /**
     * Walks every alert, re-evaluates it and sends the resulting notifications.
     *
     * <p>For each alert the set of active (job -> error) entries before and after the check is compared:
     * entries that disappeared produce a "[CLEAR] " notification, new entries a "[TRIGGER] " notification,
     * and entries whose message changed an "[ERROR CHANGED] " notification unless the alert's
     * {@code suppressChanges} policy suppresses that change. Does nothing when alerts are disabled.
     * Any exception aborts the scan and is recorded in {@code lastAlertScanFailed}.
     */
    public void scanAlerts() {
        if (!alertsEnabled) {
            return;
        }
        log.info("Started alert scan of {} alerts...", alertMap.size());
        try {
            for (Map.Entry<String, AbstractJobAlert> entry : alertMap.entrySet()) {
                AbstractJobAlert previous = entry.getValue();
                Map<String, String> errorsBefore = previous.getActiveJobs();

                // The entry may be deleted concurrently, so the re-check only runs if the alert is still
                // in the map, and runs inside the map's per-key compute.
                AbstractJobAlert rescanned = alertMap.computeIfPresent(entry.getKey(), (id, live) -> {
                    live.checkAlertForJobs(spawn, meshyClient);
                    if (!live.getActiveJobs().equals(errorsBefore)) {
                        storeAlert(live.alertId, live);
                    }
                    return live;
                });

                // A null result means the alert was removed concurrently. This does not catch every
                // removal; it is a best-effort attempt to send clears when convenient (clear emails
                // should probably move to the removal method at some point).
                if (rescanned == null) {
                    sendAlert(previous, "[CLEAR] ", errorsBefore);
                    continue;
                }

                Map<String, String> errorsAfter = rescanned.getActiveJobs();
                MapDifference<String, String> delta = Maps.difference(errorsBefore, errorsAfter);
                sendAlert(previous, "[CLEAR] ", delta.entriesOnlyOnLeft());
                sendAlert(rescanned, "[TRIGGER] ", delta.entriesOnlyOnRight());

                Map<String, String> changed = new HashMap<>();
                delta.entriesDiffering().forEach((jobId, values) -> {
                    String before = values.leftValue();
                    String after = values.rightValue();
                    if (!rescanned.suppressChanges.suppress(before, after)) {
                        changed.put(jobId, after);
                    }
                });
                sendAlert(rescanned, "[ERROR CHANGED] ", changed);
            }
            lastAlertScanFailed = false;
            log.info("Finished alert scan");
        } catch (Exception e) {
            lastAlertScanFailed = true;
            log.error("Unexpected error while scanning alerts: {}", e.getMessage(), e);
        }
    }

    /**
     * Builds the plain-text summary of one job that is embedded in alert emails.
     *
     * <p>Each task is counted in exactly one of two buckets: IDLE and ERROR tasks are "done" (ERROR
     * tasks are additionally counted as "errored"), every other state is "running". Previously ERROR
     * tasks were also counted as running, so the running and done counts could add up to more than
     * the number of nodes.
     *
     * @param job the job to summarize; may be null
     * @return the summary text, or an empty string when {@code job} is null
     */
    private static String emailSummary(Job job) {
        if (job == null) {
            return "";
        }

        long files = 0;
        double bytes = 0;
        int running = 0;
        int errored = 0;
        int done = 0;
        int numNodes = 0;

        List<JobTask> jobNodes = job.getCopyOfTasks();
        if (jobNodes != null) {
            numNodes = jobNodes.size();
            for (JobTask task : jobNodes) {
                files += task.getFileCount();
                bytes += task.getByteCount();

                switch (task.getState()) {
                case IDLE:
                    done++;
                    break;
                case ERROR:
                    done++;
                    errored++;
                    break;
                default:
                    running++;
                    break;
                }
            }
        }

        // Local, single-threaded buffer: StringBuilder avoids StringBuffer's needless synchronization.
        StringBuilder sb = new StringBuilder();
        sb.append("Cluster : ").append(clusterHead).append('\n');
        sb.append("Job : ").append(job.getId()).append('\n');
        sb.append("Job Link : http://").append(clusterHead).append(":5052/spawn2/index.html#jobs/")
                .append(job.getId()).append("/tasks\n");
        sb.append("Description : ").append(job.getDescription()).append('\n');
        sb.append("------------------------------ \n");
        sb.append("Task Summary \n");
        sb.append("------------------------------ \n");
        sb.append("Job State : ").append(job.getState()).append('\n');
        sb.append("Start Time : ").append(format(job.getStartTime())).append('\n');
        sb.append("End Time : ").append(format(job.getEndTime())).append('\n');
        sb.append("Num Nodes : ").append(numNodes).append('\n');
        sb.append("Running Nodes : ").append(running).append('\n');
        sb.append("Errored Nodes : ").append(errored).append('\n');
        sb.append("Done Nodes : ").append(done).append('\n');
        sb.append("Task files : ").append(files).append('\n');
        sb.append("Task Bytes : ").append(format(bytes)).append(" GB\n");
        sb.append("------------------------------ \n");
        return sb.toString();
    }

    /**
     * Renders a byte count as gigabytes using the shared "#.###" decimal format.
     *
     * @param bytes number of bytes
     * @return {@code bytes / GIGA_BYTE} formatted with at most three fraction digits
     */
    private static String format(double bytes) {
        return decimalFormat.format(bytes / GIGA_BYTE);
    }

    /**
     * Renders an epoch timestamp with the shared "yyMMdd-HHmm" date format.
     *
     * @param time timestamp passed to {@code new Date(long)}, or null when unknown
     * @return the formatted time, or "-" when {@code time} is null
     */
    private static String format(Long time) {
        return (time == null) ? "-" : dateFormat.format(new Date(time));
    }

    /**
     * Dispatches one notification for an alert over every channel the alert has configured.
     *
     * @param jobAlert the alert being reported
     * @param reason   notification prefix such as "[TRIGGER] " or "[CLEAR] "
     * @param errors   job id -> error message entries to report; nothing is sent when empty
     */
    private void sendAlert(AbstractJobAlert jobAlert, String reason, Map<String, String> errors) {
        if (errors.isEmpty()) {
            return;
        }

        String link = String.format("http://%s:5052/spawn2/index.html#alerts/%s", clusterHead,
                jobAlert.alertId);
        log.info("Alerting {} :: jobs : {} : {}", jobAlert.alertId, errors.keySet(), reason);

        boolean hasEmail = !StringUtils.isBlank(jobAlert.email);
        if (hasEmail) {
            sendEmailAlert(jobAlert, link, reason, errors);
        }
        boolean hasWebhook = !StringUtils.isBlank(jobAlert.webhookURL);
        if (hasWebhook) {
            sendWebhookAlert(jobAlert, link, reason, errors);
        }
    }

    /**
     * Builds the webhook payload for one notification: the alert-level fields plus one
     * {@link JobError} per entry in {@code errors}.
     *
     * <p>Job details (state, description, times, task counts) are only filled in when the job can be
     * looked up through {@code spawn}. Task counts are left at their defaults when the job reports no
     * task list, mirroring the null check in {@code emailSummary}.
     *
     * @param spawn     used to look up each job by id
     * @param jobAlert  the alert being reported
     * @param alertLink link to the alert, copied into the payload
     * @param reason    notification prefix; stored trimmed
     * @param errors    job id -> error message entries to report
     * @return the populated request object
     */
    @VisibleForTesting
    static AlertWebhookRequest getWebhookObject(Spawn spawn, AbstractJobAlert jobAlert, String alertLink,
            String reason, Map<String, String> errors) {

        AlertWebhookRequest webhookRequest = new AlertWebhookRequest();
        webhookRequest.setAlertType(jobAlert.getTypeString());
        webhookRequest.setAlertLink(alertLink);
        webhookRequest.setAlertReason(reason.trim());
        webhookRequest.setAlertDescription(jobAlert.description);

        // Turn all the jobs in error into a list of information about each job
        for (Map.Entry<String, String> entry : errors.entrySet()) {
            String jobUUID = entry.getKey();

            JobError jobError = new JobError();
            jobError.setId(jobUUID);
            jobError.setError(entry.getValue());
            jobError.setClusterHead(clusterHead);

            Job job = spawn.getJob(jobUUID);
            if (job != null) {
                jobError.setJobState(job.getState());
                jobError.setDescription(job.getDescription());

                // The setters take primitive longs, so null times must be skipped.
                if (job.getStartTime() != null) {
                    jobError.setStartTime(job.getStartTime());
                }
                if (job.getEndTime() != null) {
                    jobError.setEndTime(job.getEndTime());
                }

                List<JobTask> jobTasks = job.getCopyOfTasks();
                if (jobTasks != null) {
                    jobError.setNodeCount(jobTasks.size());
                    jobError.setErrorCount(
                            (int) jobTasks.stream().filter(t -> t.getState() == JobTaskState.ERROR).count());
                }
            }

            webhookRequest.getJobsInError().add(jobError);
        }

        return webhookRequest;
    }

    /**
     * Posts the JSON webhook payload for one notification to the alert's webhook URL.
     *
     * <p>Failures are logged and not propagated: an {@link IOException} while building or sending the
     * request, or a response status of 300 or above.
     *
     * @param jobAlert  the alert being reported; its {@code webhookURL} is the target
     * @param alertLink link to the alert, included in the payload
     * @param reason    notification prefix such as "[TRIGGER] "
     * @param errors    job id -> error message entries to report
     */
    private void sendWebhookAlert(AbstractJobAlert jobAlert, String alertLink, String reason,
            Map<String, String> errors) {

        try {
            byte[] body = objectMapper
                    .writeValueAsBytes(getWebhookObject(spawn, jobAlert, alertLink, reason, errors));
            // NOTE(review): 5_000 is presumably a timeout in milliseconds -- confirm against HttpUtil.
            HttpResponse response = HttpUtil.httpPost(jobAlert.webhookURL,
                    ContentType.APPLICATION_JSON.getMimeType(), body, 5_000);
            int status = response.getStatus();
            if (status >= 300) {
                // Report the actual status; the check accepts every status below 300, not just 200.
                log.error("non-2xx status code {} received for webhook alert for alert {}", status,
                        jobAlert.alertId);
            }
        } catch (IOException ex) {
            log.error("unable to send webhook alert for alert {}", jobAlert.alertId, ex);
        }
    }

    /**
     * Sends the email for one notification (trigger, clear or error change) of an alert.
     *
     * <p>The subject is "reason type - detail", where detail is the first line of the alert description
     * or, when the description is blank, the set of affected job ids. The body lists the alert link, the
     * description (if any) and, per affected job, a job summary followed by its error message. A failed
     * send is logged.
     *
     * @param jobAlert  the alert being reported; its {@code email} field is the recipient
     * @param alertLink link to the alert, included in the body
     * @param reason    notification prefix such as "[TRIGGER] "
     * @param errors    job id -> error message entries to report
     */
    private void sendEmailAlert(AbstractJobAlert jobAlert, String alertLink, String reason,
            Map<String, String> errors) {

        String description = jobAlert.description;
        boolean hasDescription = !StringUtils.isBlank(description);
        String subjectDetail = hasDescription ? description.split("\n")[0] : errors.keySet().toString();
        String subject = String.format("%s %s - %s", reason, jobAlert.getTypeString(), subjectDetail);

        StringBuilder body = new StringBuilder();
        body.append(reason).append(' ').append(jobAlert.getTypeString()).append('\n');
        body.append("Alert link : ").append(alertLink).append('\n');
        if (hasDescription) {
            body.append("Alert Description : ").append(description).append('\n');
        }

        errors.forEach((jobId, message) -> body
                .append(emailSummary(spawn.getJob(jobId))).append('\n')
                .append("Error Message\n")
                .append(message)
                .append("\n------------------------------\n"));

        boolean sent = EmailUtil.email(jobAlert.email, subject, body.toString());
        if (!sent) {
            log.error("Unable to send email for alert {}", jobAlert.alertId);
        }
    }

    /**
     * Loads every alert stored under the common alert path into memory and logs how many are held.
     */
    private void loadAlertMap() {
        spawnDataStore.getAllChildren(SPAWN_COMMON_ALERT_PATH).forEach((alertId, rawAlert) -> {
            // Underscores are used to mark meta-information (for now, whether we have loaded legacy alerts.)
            if (alertId.startsWith("_")) {
                return;
            }
            loadAlert(alertId, rawAlert);
        });
        log.info("{} alerts loaded", alertMap.size());
    }

    /**
     * Decodes one stored alert and registers it in both in-memory maps. An alert that cannot be
     * decoded is logged and skipped.
     *
     * @param id  alert id, used as the key in the alert map
     * @param raw the JSON-encoded alert as read from the data store
     */
    private void loadAlert(String id, String raw) {
        final AbstractJobAlert decoded;
        try {
            decoded = CodecJSON.decodeString(AbstractJobAlert.class, raw);
            alertMap.put(id, decoded);
            // No previous version to unlink at load time, hence the null.
            updateJobToAlertsMap(id, null, decoded);
        } catch (Exception ex) {
            log.error("Failed to decode JobAlert id={} raw={}", id, raw, ex);
        }
    }

    /**
     * Re-links an alert in the (job + alias) -> alert index: first drops the associations of the
     * previous version, then adds those of the new version. Either side may be absent.
     *
     * @param id    alertId
     * @param old   previous version of the alert; when null nothing is removed
     * @param alert new version of the alert; when null nothing is inserted
     */
    private void updateJobToAlertsMap(@Nonnull String id, @Nullable AbstractJobAlert old,
            @Nullable AbstractJobAlert alert) {
        if (old != null) {
            old.jobIds.forEach(staleJobId -> jobToAlertsMap.remove(staleJobId, id));
        }
        if (alert != null) {
            alert.jobIds.forEach(currentJobId -> jobToAlertsMap.put(currentJobId, id));
        }
    }

    /**
     * Inserts or replaces an alert. When an alert with the same id already exists, its state is
     * carried over into the new alert before the job index and the data store are updated.
     *
     * @param id    alert id
     * @param alert the alert to store under {@code id}
     */
    public void putAlert(String id, AbstractJobAlert alert) {
        alertMap.compute(id, (alertId, previous) -> {
            boolean replacing = previous != null;
            if (replacing) {
                alert.setStateFrom(previous);
            }
            updateJobToAlertsMap(alertId, previous, alert);
            storeAlert(alertId, alert);
            return alert;
        });
    }

    /**
     * Deletes an alert from memory, from the job index and from the data store. Unknown or null ids
     * are ignored.
     *
     * @param id alert id; may be null
     */
    public void removeAlert(String id) {
        if (id == null) {
            return;
        }
        alertMap.computeIfPresent(id, (alertId, existing) -> {
            updateJobToAlertsMap(alertId, existing, null);
            storeAlert(alertId, null);
            // Returning null removes the mapping.
            return null;
        });
    }

    /**
     * Persists an alert in the data store, or deletes its stored copy.
     *
     * <p>Best effort: a failure is logged, including the causing exception, and not propagated.
     *
     * @param alertId id the alert is stored under
     * @param alert   the alert to write; when null the stored alert is deleted instead
     */
    private void storeAlert(String alertId, @Nullable AbstractJobAlert alert) {
        try {
            if (alert != null) {
                spawnDataStore.putAsChild(SPAWN_COMMON_ALERT_PATH, alertId, CodecJSON.encodeString(alert));
            } else {
                spawnDataStore.deleteChild(SPAWN_COMMON_ALERT_PATH, alertId);
            }
        } catch (Exception e) {
            // Pass the exception as the last argument so the cause and stack trace reach the log.
            log.warn("Warning: failed to save alert id={} alert={}", alertId, alert, e);
        }
    }

    /**
     * Get a snapshot of the alert map as an array, mainly for rendering in the UI.
     *
     * <p>An alert that cannot be converted to JSON is logged (with the cause) and left out.
     *
     * @return A JSONArray holding the JSON representation of each existing alert
     */
    public JSONArray getAlertStateArray() {
        JSONArray rv = new JSONArray();
        for (AbstractJobAlert jobAlert : alertMap.values()) {
            try {
                rv.put(jobAlert.toJSON());
            } catch (Exception e) {
                log.warn("Warning: failed to send alert in array: {}", jobAlert, e);
            }
        }
        return rv;
    }

    /**
     * Get a snapshot of the alert map as a JSON object keyed by alert id.
     *
     * <p>An alert that cannot be converted to JSON is logged (with the cause) and left out.
     *
     * @return A JSONObject mapping each alert id to the JSON representation of that alert
     */
    public JSONObject getAlertStateMap() {
        JSONObject rv = new JSONObject();
        for (AbstractJobAlert jobAlert : alertMap.values()) {
            try {
                rv.put(jobAlert.alertId, jobAlert.toJSON());
            } catch (Exception e) {
                log.warn("Warning: failed to send alert in map: {}", jobAlert, e);
            }
        }
        return rv;
    }

    /**
     * Looks up one alert and returns it as a JSON string.
     *
     * @param alertId id of the alert to fetch
     * @return the alert's JSON text, or null when no such alert exists or the lookup/conversion fails
     */
    public String getAlert(String alertId) {
        try {
            AbstractJobAlert found = alertMap.get(alertId);
            return (found == null) ? null : found.toJSON().toString();
        } catch (Exception e) {
            log.warn("Failed to fetch alert {}", alertId, e);
            return null;
        }
    }

    /**
     * Produces a copy of an alert that no longer lists a given job id. The copy is made by converting
     * the alert to a JSON tree, replacing its "jobIds" array, and converting the tree back.
     *
     * @param jobId the job id to leave out
     * @param old   the alert to copy
     * @return the modified copy, or {@code old} itself when the conversion back fails
     */
    private AbstractJobAlert copyWithoutJobId(@Nonnull String jobId, AbstractJobAlert old) {
        ObjectNode tree = Jackson.defaultMapper().valueToTree(old);
        // putArray swaps in a fresh, empty array under "jobIds"; refill it with the ids to keep.
        ArrayNode keptIds = tree.putArray("jobIds");
        for (String existingId : old.jobIds) {
            if (!existingId.equals(jobId)) {
                keptIds.add(existingId);
            }
        }
        try {
            return Jackson.defaultMapper().treeToValue(tree, AbstractJobAlert.class);
        } catch (IOException ex) {
            log.error("Internal error removing job alerts:", ex);
            return old;
        }
    }

    /**
     * Remove {@code jobId} from all alerts that are monitoring it. Delete alerts that are only monitoring this job.
     *
     * <p>Inconsistencies between the job index and the alert map are logged as warnings: an indexed
     * alert that no longer exists, or one that does not actually reference the job.
     *
     * @param jobId the job (or alias) whose alert associations should be removed
     */
    public void removeAlertsForJob(String jobId) {
        Set<String> alertIds = ImmutableSet.copyOf(jobToAlertsMap.get(jobId));
        for (String mappedAlertId : alertIds) {
            // computeIfPresent returns null both when the alert is missing and when the function below
            // deletes it, so its return value cannot tell those cases apart. Record explicitly whether
            // the function ran (one-element array because lambdas can only capture effectively final
            // locals).
            boolean[] alertExists = {false};
            alertMap.computeIfPresent(mappedAlertId, (alertId, alert) -> {
                alertExists[0] = true;
                ImmutableList<String> jobIds = alert.jobIds;
                if (jobIds.contains(jobId)) {
                    @Nullable
                    AbstractJobAlert newAlert;
                    if (jobIds.size() == 1) {
                        // The alert monitors only this job: returning null deletes it.
                        newAlert = null;
                    } else {
                        newAlert = copyWithoutJobId(jobId, alert);
                    }
                    updateJobToAlertsMap(alertId, alert, newAlert);
                    storeAlert(alertId, newAlert);
                    return newAlert;
                } else {
                    log.warn("jobToAlertsMap has mapping from job {} to alert {} but alert has no reference to job",
                            jobId, alertId);
                    return alert;
                }
            });
            if (!alertExists[0]) {
                log.warn("jobToAlertsMap has mapping from job {} to alert {} but alert does not exist", jobId,
                        mappedAlertId);
            }
        }
    }

    /**
     * Returns alerts for the given job id. Does not look up aliases for a job id. If job id is an alias, will
     * return any alerts that are configured on the alias, but will not look up alerts on the actual job id.
     *
     * <p>Alert ids that are indexed for the job but no longer present in the alert map (for example
     * because the alert was removed concurrently) are skipped, so the result never contains null.
     *
     * @param jobId job id or alias to look up
     * @return the alerts currently registered for {@code jobId}; empty when there are none
     */
    public Set<AbstractJobAlert> getAlertsForJob(String jobId) {
        Set<String> alertIds = ImmutableSet.copyOf(jobToAlertsMap.get(jobId));
        return alertIds.stream()
                .map(alertMap::get)
                .filter(alert -> alert != null)
                .collect(Collectors.toSet());
    }

    /**
     * Body of a webhook notification: alert-level fields plus one {@link JobError} per affected job.
     * Instances are built by {@code getWebhookObject} and written as JSON by {@code sendWebhookAlert};
     * the {@code @JsonProperty} annotations give the snake_case names used in that JSON.
     */
    @VisibleForTesting
    static class AlertWebhookRequest {

        private String alertDescription;
        private String alertLink;
        private String alertType;
        private String alertReason;
        // Starts out as an empty, mutable list; getWebhookObject appends to it through the getter.
        private List<JobError> jobsInError = Lists.newArrayList();

        public AlertWebhookRequest() {

        }

        /** @return the alert's description, serialized as "alert_description" */
        @JsonProperty("alert_description")
        public String getAlertDescription() {
            return alertDescription;
        }

        public void setAlertDescription(String alertDescription) {
            this.alertDescription = alertDescription;
        }

        /** @return link to the alert, serialized as "alert_link" */
        @JsonProperty("alert_link")
        public String getAlertLink() {
            return alertLink;
        }

        public void setAlertLink(String alertLink) {
            this.alertLink = alertLink;
        }

        /** @return the alert's type string, serialized as "alert_type" */
        @JsonProperty("alert_type")
        public String getAlertType() {
            return alertType;
        }

        public void setAlertType(String alertType) {
            this.alertType = alertType;
        }

        /** @return the notification reason, serialized as "alert_reason" */
        @JsonProperty("alert_reason")
        public String getAlertReason() {
            return alertReason;
        }

        public void setAlertReason(String alertReason) {
            this.alertReason = alertReason;
        }

        /** @return the live (not copied) list of per-job errors, serialized as "jobs_in_error" */
        @JsonProperty("jobs_in_error")
        public List<JobError> getJobsInError() {
            return jobsInError;
        }

        public void setJobsInError(List<JobError> jobsInError) {
            this.jobsInError = jobsInError;
        }
    }

    /**
     * One affected job inside an {@link AlertWebhookRequest}: the job id and error message, plus job
     * details that {@code getWebhookObject} fills in only when the job can be looked up. Fields that
     * are never set keep their Java defaults (null / 0).
     */
    @VisibleForTesting
    static class JobError {

        private String id;
        private String description;
        private String clusterHead;
        private String error;
        private JobState jobState;
        // Primitive longs: stay 0 when the job has no start/end time.
        private long startTime;
        private long endTime;
        private int nodeCount;
        private int errorCount;

        public JobError() {

        }

        /** @return the job id (the key of the reported error entry) */
        public String getId() {
            return id;
        }

        public void setId(String id) {
            this.id = id;
        }

        /** @return the job's description */
        public String getDescription() {
            return description;
        }

        public void setDescription(String description) {
            this.description = description;
        }

        /** @return the cluster head host, serialized as "cluster_head" */
        @JsonProperty("cluster_head")
        public String getClusterHead() {
            return clusterHead;
        }

        public void setClusterHead(String clusterHead) {
            this.clusterHead = clusterHead;
        }

        /** @return the error message reported for this job */
        public String getError() {
            return error;
        }

        public void setError(String error) {
            this.error = error;
        }

        /** @return the job's state, serialized as "job_state" */
        @JsonProperty("job_state")
        public JobState getJobState() {
            return jobState;
        }

        public void setJobState(JobState jobState) {
            this.jobState = jobState;
        }

        /** @return the job's start time, serialized as "start_time" */
        @JsonProperty("start_time")
        public long getStartTime() {
            return startTime;
        }

        public void setStartTime(long startTime) {
            this.startTime = startTime;
        }

        /** @return the job's end time, serialized as "end_time" */
        @JsonProperty("end_time")
        public long getEndTime() {
            return endTime;
        }

        public void setEndTime(long endTime) {
            this.endTime = endTime;
        }

        /** @return the number of tasks of the job, serialized as "node_count" */
        @JsonProperty("node_count")
        public int getNodeCount() {
            return nodeCount;
        }

        public void setNodeCount(int nodeCount) {
            this.nodeCount = nodeCount;
        }

        /** @return the number of tasks in the ERROR state, serialized as "error_count" */
        @JsonProperty("error_count")
        public int getErrorCount() {
            return errorCount;
        }

        public void setErrorCount(int errorCount) {
            this.errorCount = errorCount;
        }
    }
}