com.addthis.hydra.job.alert.JobAlertRunner.java — source code

Java tutorial

Introduction

Below is the source code for com.addthis.hydra.job.alert.JobAlertRunner.java.

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.addthis.hydra.job.alert;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import java.io.IOException;

import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import java.text.DecimalFormat;
import java.text.SimpleDateFormat;

import com.addthis.basis.util.LessStrings;

import com.addthis.codec.jackson.Jackson;
import com.addthis.codec.json.CodecJSON;
import com.addthis.hydra.job.Job;
import com.addthis.hydra.job.JobTask;
import com.addthis.hydra.job.JobTaskState;
import com.addthis.hydra.job.spawn.Spawn;
import com.addthis.hydra.job.spawn.SpawnMesh;
import com.addthis.hydra.job.store.SpawnDataStore;
import com.addthis.hydra.util.EmailUtil;
import com.addthis.maljson.JSONArray;
import com.addthis.maljson.JSONObject;
import com.addthis.meshy.MeshyClient;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.MapDifference;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimaps;
import com.google.common.collect.SetMultimap;

import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.typesafe.config.ConfigFactory;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static com.addthis.hydra.job.store.SpawnDataStoreKeys.SPAWN_COMMON_ALERT_PATH;

/**
 * This class runs over the set of job alerts, sending trigger/clear emails as appropriate
 */
/**
 * This class runs over the set of job alerts, sending trigger/clear emails as appropriate.
 *
 * <p>Alert state is kept in a {@link ConcurrentHashMap}; per-alert mutations are serialized
 * through {@code compute}/{@code computeIfPresent} on that map so that scan, put, and remove
 * operations do not interleave for the same alert id.
 */
public class JobAlertRunner {

    private static final Logger log = LoggerFactory.getLogger(JobAlertRunner.class);
    private static final String clusterHead = ConfigFactory.load()
            .getString("com.addthis.hydra.job.spawn.Spawn.httpHost");
    private static final String meshHost = SpawnMesh.getMeshHost();
    private static final int meshPort = SpawnMesh.getMeshPort();

    private static final long GIGA_BYTE = (long) Math.pow(1024, 3);
    // SimpleDateFormat and DecimalFormat are NOT thread-safe, and this class is used from
    // multiple threads (alert scans plus management calls). Keep only the patterns static
    // and build a fresh formatter per call in the format(...) helpers below.
    private static final String DATE_FORMAT_PATTERN = "yyMMdd-HHmm";
    private static final String DECIMAL_FORMAT_PATTERN = "#.###";

    private final Spawn spawn;
    private final SpawnDataStore spawnDataStore;
    private final ConcurrentHashMap<String, AbstractJobAlert> alertMap;

    /**
     * A mapping from (jobIds + aliases) to a set of alertIds.
     * Does not dereference aliases into their corresponding jobIds.
     */
    private final SetMultimap<String, String> jobToAlertsMap = Multimaps
            .synchronizedSetMultimap(HashMultimap.create());

    // Null when the mesh client could not be constructed; downstream alert checks must
    // tolerate a null client.
    private final MeshyClient meshyClient;
    // volatile: toggled by enable/disable from management threads, read by the scan thread
    private volatile boolean alertsEnabled;
    private volatile boolean lastAlertScanFailed;

    /**
     * Creates a runner bound to the given {@link Spawn} instance and loads all persisted
     * alerts from the spawn data store.
     *
     * @param spawn        owning spawn instance; also supplies the data store
     * @param alertEnabled whether alert scanning starts enabled
     */
    public JobAlertRunner(Spawn spawn, boolean alertEnabled) {
        this.spawn = spawn;
        this.spawnDataStore = spawn.getSpawnDataStore();
        MeshyClient client;
        try {
            client = new MeshyClient(meshHost, meshPort);
        } catch (IOException e) {
            log.warn("Warning: failed to instantiate job alert mesh client", e);
            client = null;
        }
        this.meshyClient = client;
        this.alertsEnabled = alertEnabled;
        this.alertMap = new ConcurrentHashMap<>();
        loadAlertMap();
    }

    /** Disables alert scanning */
    public void disableAlerts() {
        this.alertsEnabled = false;
    }

    /** Enables alert scanning */
    public void enableAlerts() {
        this.alertsEnabled = true;
    }

    public boolean isAlertsEnabled() {
        return alertsEnabled;
    }

    public boolean isLastAlertScanFailed() {
        return lastAlertScanFailed;
    }

    /**
     * Iterate over alert map, checking the status of each alert and sending emails as needed.
     * A failure anywhere in the scan marks {@link #lastAlertScanFailed} and aborts the pass;
     * the next scheduled scan starts fresh.
     */
    public void scanAlerts() {
        if (alertsEnabled) {
            log.info("Started alert scan of {} alerts...", alertMap.size());
            try {
                for (Map.Entry<String, AbstractJobAlert> entry : alertMap.entrySet()) {
                    AbstractJobAlert oldAlert = entry.getValue();
                    Map<String, String> currentErrors = oldAlert.getActiveJobs();
                    // entry may be concurrently deleted, so only recompute if still present, and while locked
                    AbstractJobAlert alert = alertMap.computeIfPresent(entry.getKey(), (id, currentAlert) -> {
                        currentAlert.checkAlertForJobs(currentAlert.getAlertJobs(spawn), meshyClient);
                        // persist only when the active-job set actually changed
                        if (!currentAlert.getActiveJobs().equals(currentErrors)) {
                            storeAlert(currentAlert.alertId, currentAlert);
                        }
                        return currentAlert;
                    });
                    // null if it was concurrently removed from the map. Does not catch all removals, but might as well
                    // make a best effort attempt to send clears when convenient (should probably move clear emails to
                    // the removal method at some point)
                    if (alert == null) {
                        emailAlert(oldAlert, "[CLEAR] ", currentErrors);
                    } else {
                        Map<String, String> newErrors = alert.getActiveJobs();
                        MapDifference<String, String> difference = Maps.difference(currentErrors, newErrors);
                        // jobs that dropped out of the error set have cleared; new entries have triggered
                        emailAlert(oldAlert, "[CLEAR] ", difference.entriesOnlyOnLeft());
                        emailAlert(alert, "[TRIGGER] ", difference.entriesOnlyOnRight());
                        Map<String, String> errorsChanged = new HashMap<>();
                        for (Map.Entry<String, MapDifference.ValueDifference<String>> differing : difference
                                .entriesDiffering().entrySet()) {
                            String oldValue = differing.getValue().leftValue();
                            String newValue = differing.getValue().rightValue();
                            // per-alert suppression policy decides whether a message change is mail-worthy
                            if (!alert.suppressChanges.suppress(oldValue, newValue)) {
                                errorsChanged.put(differing.getKey(), newValue);
                            }
                        }
                        emailAlert(alert, "[ERROR CHANGED] ", errorsChanged);
                    }
                }
                lastAlertScanFailed = false;
                log.info("Finished alert scan");
            } catch (Exception e) {
                lastAlertScanFailed = true;
                log.error("Unexpected error while scanning alerts: {}", e.getMessage(), e);
            }
        }
    }

    /**
     * Builds a plain-text summary of a job (state, task counts, bytes) for inclusion in
     * alert emails. Returns an empty string when {@code job} is null.
     */
    private static String summary(Job job) {
        long files = 0;
        double bytes = 0;
        int running = 0;
        int errored = 0;
        int done = 0;
        int numNodes = 0;

        // single-threaded local accumulation; no need for StringBuffer's synchronization
        StringBuilder sb = new StringBuilder();

        if (job != null) {

            List<JobTask> jobNodes = job.getCopyOfTasks();

            if (jobNodes != null) {
                numNodes = jobNodes.size();
                for (JobTask task : jobNodes) {
                    files += task.getFileCount();
                    bytes += task.getByteCount();

                    // any non-IDLE task counts as running
                    if (!task.getState().equals(JobTaskState.IDLE)) {
                        running++;
                    }
                    switch (task.getState()) {
                    case IDLE:
                        done++;
                        break;
                    case ERROR:
                        // errored tasks are also counted as done
                        done++;
                        errored++;
                        break;
                    default:
                        break;
                    }
                }
            }
            sb.append("Cluster : " + clusterHead + "\n");
            sb.append("Job : " + job.getId() + "\n");
            sb.append("Job Link : http://" + clusterHead + ":5052/spawn2/index.html#jobs/" + job.getId()
                    + "/tasks\n");
            sb.append("Description : " + job.getDescription() + "\n");
            sb.append("------------------------------ \n");
            sb.append("Task Summary \n");
            sb.append("------------------------------ \n");
            sb.append("Job State : " + job.getState() + "\n");
            sb.append("Start Time : " + format(job.getStartTime()) + "\n");
            sb.append("End Time : " + format(job.getEndTime()) + "\n");
            sb.append("Num Nodes : " + numNodes + "\n");
            sb.append("Running Nodes : " + running + "\n");
            sb.append("Errored Nodes : " + errored + "\n");
            sb.append("Done Nodes : " + done + "\n");
            sb.append("Task files : " + files + "\n");
            sb.append("Task Bytes : " + format(bytes) + " GB\n");
            sb.append("------------------------------ \n");
        }
        return sb.toString();
    }

    /** Formats a byte count as gigabytes with up to three decimal places. */
    private static String format(double bytes) {
        double gb = bytes / GIGA_BYTE;

        // DecimalFormat is not thread-safe; build a fresh instance per call
        return new DecimalFormat(DECIMAL_FORMAT_PATTERN).format(gb);
    }

    /** Formats an epoch-millis timestamp as "yyMMdd-HHmm", or "-" when null. */
    private static String format(Long time) {
        if (time != null) {
            // SimpleDateFormat is not thread-safe; build a fresh instance per call
            return new SimpleDateFormat(DATE_FORMAT_PATTERN).format(new Date(time));
        } else {
            return "-";
        }
    }

    /**
     * Send an email when an alert fires or clears.
     *
     * @param jobAlert The alert to modify
     * @param reason   subject prefix, e.g. {@code "[TRIGGER] "} or {@code "[CLEAR] "}
     * @param errors   jobId-to-error-message entries to report; no-op when empty
     */
    private void emailAlert(AbstractJobAlert jobAlert, String reason, Map<String, String> errors) {
        if (errors.isEmpty()) {
            return;
        }
        String description = jobAlert.description;
        boolean hasDescription = LessStrings.isNotEmpty(description);
        String subject;
        if (hasDescription) {
            subject = reason + ' ' + description;
        } else {
            subject = String.format("%s %s - %s - %s", reason, jobAlert.getTypeString(),
                    JobAlertRunner.getClusterHead(), errors.keySet());
        }
        log.info("Alerting {} :: jobs : {} : {}", jobAlert.email, errors.keySet(), reason);
        StringBuilder sb = new StringBuilder(reason + ' ' + jobAlert.getTypeString() + '\n');
        sb.append(
                "Alert link : http://" + clusterHead + ":5052/spawn2/index.html#alerts/" + jobAlert.alertId + '\n');
        if (hasDescription) {
            sb.append("Alert Description : " + description + '\n');
        }
        for (Map.Entry<String, String> entry : errors.entrySet()) {
            sb.append(summary(spawn.getJob(entry.getKey())) + '\n');
            sb.append("Error Message\n");
            sb.append(entry.getValue());
            sb.append("\n------------------------------\n");
        }
        if (!EmailUtil.email(jobAlert.email, subject, sb.toString())) {
            log.error("Unable to send email for alert {}", jobAlert.alertId);
        }
    }

    /** Loads every persisted alert from the data store into {@link #alertMap}. */
    private void loadAlertMap() {
        Map<String, String> alertsRaw = spawnDataStore.getAllChildren(SPAWN_COMMON_ALERT_PATH);
        for (Map.Entry<String, String> entry : alertsRaw.entrySet()) {
            // Underscores are used to mark meta-information (for now, whether we have loaded legacy alerts.)
            if (!entry.getKey().startsWith("_")) {
                loadAlert(entry.getKey(), entry.getValue());
            }
        }
        log.info("{} alerts loaded", alertMap.size());
    }

    /** Decodes one persisted alert and registers it; a decode failure skips only that alert. */
    private void loadAlert(String id, String raw) {
        try {
            AbstractJobAlert jobAlert = CodecJSON.decodeString(AbstractJobAlert.class, raw);
            alertMap.put(id, jobAlert);
            updateJobToAlertsMap(id, null, jobAlert);
        } catch (Exception ex) {
            log.error("Failed to decode JobAlert id={} raw={}", id, raw, ex);
        }
    }

    /**
     * Remove any outdated mappings from a (job + alias) to an alert and
     * insert new mappings. If {@code old} is null then do not remove
     * any mappings. If {@code alert} is null then do not insert any mappings.
     *
     * @param id    alertId
     * @param old   if non-null then remove associations
     * @param alert if non-null then insert associations
     */
    private void updateJobToAlertsMap(@Nonnull String id, @Nullable AbstractJobAlert old,
            @Nullable AbstractJobAlert alert) {
        if (old != null) {
            for (String jobId : old.jobIds) {
                jobToAlertsMap.remove(jobId, id);
            }
        }
        if (alert != null) {
            for (String jobId : alert.jobIds) {
                jobToAlertsMap.put(jobId, id);
            }
        }
    }

    /**
     * Inserts or replaces an alert, carrying forward runtime state from any previous
     * version and persisting the result. Serialized per-id via {@code compute}.
     */
    public void putAlert(String id, AbstractJobAlert alert) {
        alertMap.compute(id, (key, old) -> {
            if (old != null) {
                alert.setStateFrom(old);
            }
            updateJobToAlertsMap(id, old, alert);
            storeAlert(id, alert);
            return alert;
        });
    }

    /** Removes an alert by id from the map, the job index, and the data store. */
    public void removeAlert(String id) {
        if (id != null) {
            alertMap.computeIfPresent(id, (key, value) -> {
                updateJobToAlertsMap(id, value, null);
                storeAlert(id, null);
                // returning null removes the entry from alertMap
                return null;
            });
        }
    }

    /**
     * Persists an alert to the data store, or deletes the stored entry when
     * {@code alert} is null. Failures are logged and swallowed (best-effort persistence).
     */
    private void storeAlert(String alertId, @Nullable AbstractJobAlert alert) {
        try {
            if (alert != null) {
                spawnDataStore.putAsChild(SPAWN_COMMON_ALERT_PATH, alertId, CodecJSON.encodeString(alert));
            } else {
                spawnDataStore.deleteChild(SPAWN_COMMON_ALERT_PATH, alertId);
            }
        } catch (Exception e) {
            // include the exception so the failure cause is not lost
            log.warn("Warning: failed to save alert id={} alert={}", alertId, alert, e);
        }
    }

    /**
     * Get a snapshot of the alert map as an array, mainly for rendering in the UI.
     *
     * @return A JSONObject representation of all existing alerts
     */
    public JSONArray getAlertStateArray() {
        JSONArray rv = new JSONArray();
        for (AbstractJobAlert jobAlert : alertMap.values()) {
            try {
                rv.put(jobAlert.toJSON());
            } catch (Exception e) {
                log.warn("Warning: failed to send alert in array: {}", jobAlert, e);
            }
        }
        return rv;
    }

    /**
     * Get a snapshot of the alert map keyed by alert id, mainly for rendering in the UI.
     */
    public JSONObject getAlertStateMap() {
        JSONObject rv = new JSONObject();
        for (AbstractJobAlert jobAlert : alertMap.values()) {
            try {
                rv.put(jobAlert.alertId, jobAlert.toJSON());
            } catch (Exception e) {
                log.warn("Warning: failed to send alert in map: {}", jobAlert, e);
            }
        }
        return rv;
    }

    /**
     * Returns the JSON string for one alert, or null if the alert does not exist or
     * cannot be serialized.
     */
    public String getAlert(String alertId) {
        try {
            AbstractJobAlert alert = alertMap.get(alertId);
            if (alert == null) {
                return null;
            } else {
                return alert.toJSON().toString();
            }
        } catch (Exception e) {
            log.warn("Failed to fetch alert {}", alertId, e);
            return null;
        }
    }

    public static String getClusterHead() {
        return clusterHead;
    }

    /** Copy and then modify an alert by removing a specific job id. */
    private AbstractJobAlert copyWithoutJobId(@Nonnull String jobId, AbstractJobAlert old) {
        // round-trip through JSON so the copy gets a fully reconstructed alert instance
        ObjectNode json = Jackson.defaultMapper().valueToTree(old);
        ArrayNode jsonArray = json.putArray("jobIds");
        old.jobIds.stream().filter(x -> !x.equals(jobId)).forEach(jsonArray::add);
        try {
            return Jackson.defaultMapper().treeToValue(json, AbstractJobAlert.class);
        } catch (IOException ex) {
            log.error("Internal error removing job alerts:", ex);
            // fall back to the unmodified alert rather than dropping it entirely
            return old;
        }
    }

    /**
     * Remove {@code jobId} from all alerts that are monitoring it. Delete alerts that are only monitoring this job.
     */
    public void removeAlertsForJob(String jobId) {
        // snapshot to avoid iterating the live (synchronized) multimap while mutating it
        Set<String> alertIds = ImmutableSet.copyOf(jobToAlertsMap.get(jobId));
        for (String mappedAlertId : alertIds) {
            if (alertMap.computeIfPresent(mappedAlertId, (alertId, alert) -> {
                ImmutableList<String> jobIds = alert.jobIds;
                if (jobIds.contains(jobId)) {
                    @Nullable
                    AbstractJobAlert newAlert;
                    if (jobIds.size() == 1) {
                        // this was the alert's only job: delete the alert outright
                        newAlert = null;
                    } else {
                        newAlert = copyWithoutJobId(jobId, alert);
                    }
                    updateJobToAlertsMap(alertId, alert, newAlert);
                    storeAlert(alertId, newAlert);
                    return newAlert;
                } else {
                    log.warn("jobToAlertsMap has mapping from job {} to alert {} but alert has no reference to job",
                            jobId, alertId);
                    return alert;
                }
            }) == null) {
                log.warn("jobToAlertsMap has mapping from job {} to alert {} but alert does not exist", jobId,
                        mappedAlertId);
            }
        }
    }

}