azkaban.sla.SLAManager.java Source code

Java tutorial

Introduction

Here is the source code for azkaban.sla.SLAManager.java

Source

package azkaban.sla;

import java.lang.Thread.State;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.log4j.Logger;
import org.joda.time.DateTime;

import azkaban.executor.ExecutableFlow;
import azkaban.executor.ExecutableFlow.Status;
import azkaban.executor.ExecutableNode;
import azkaban.executor.ExecutorManager;
import azkaban.executor.ExecutorManagerException;
import azkaban.sla.SLA.SlaAction;
import azkaban.sla.SLA.SlaRule;
import azkaban.sla.SLA.SlaSetting;
import azkaban.utils.Props;

/*
 * Copyright 2012 LinkedIn, Inc
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

/**
 * The SLAManager stores and checks SLAs (Service Level Agreements). It uses a single
 * thread that waits until the check time of each flow, and of the individual jobs in
 * the flow when a job-level SLA is set.
 */
public class SLAManager {
    private static final Logger logger = Logger.getLogger(SLAManager.class);

    // Persistence layer for SLAs; assigned once in the constructor.
    private final SLALoader loader;

    // Background thread that holds the pending SLAs and checks them when due.
    private final SLARunner runner;
    private final ExecutorManager executorManager;
    // Sends SLA notification emails; assigned once in the constructor.
    private final SlaMailer mailer;

    // Wall-clock time (ms) of the runner's most recent loop iteration, -1 until
    // the first one. Written by the runner thread and read by callers of
    // getLastCheckTime(), hence volatile.
    private volatile long lastCheckTime = -1;

    /**
     * Creates the manager, queues every SLA returned by the loader on the
     * runner, and starts the runner thread.
     *
     * @param executorManager used to look up and cancel flow executions
     * @param loader loads, inserts and removes persisted SLAs
     * @param props configuration handed to the {@code SlaMailer}
     * @throws SLAManagerException if the loader fails to load the SLAs
     */
    public SLAManager(ExecutorManager executorManager, SLALoader loader, Props props) throws SLAManagerException {
        this.executorManager = executorManager;
        this.loader = loader;
        this.mailer = new SlaMailer(props);
        this.runner = new SLARunner();

        // A load failure propagates to the caller unchanged.
        final List<SLA> persisted = loader.loadSLAs();
        for (SLA pending : persisted) {
            runner.addRunnerSLA(pending);
        }

        // Start only after every persisted SLA has been queued.
        runner.start();
    }

    /**
     * Stops the SLA runner thread by delegating to the runner's own shutdown.
     * The manager may not be safe to use after this call.
     */
    public void shutdown() {
        runner.shutdown();
    }

    /**
     * Removes the given SLA from the runner's queue and then from the loader.
     *
     * @param s the SLA to remove
     * @throws SLAManagerException if the loader fails to remove the SLA
     */
    public void removeSLA(SLA s) throws SLAManagerException {
        final String description = s.toString();
        logger.info("Removing SLA " + description);

        // Queue first, storage second.
        runner.removeRunnerSLA(s);
        loader.removeSLA(s);
    }

    /**
     * Builds a new SLA from the arguments, stores it through the loader and
     * queues it on the runner.
     *
     * @param execId id of the flow execution the SLA applies to
     * @param id identifier passed to the {@code SLA} constructor (callers in this
     *        file pass a job id for job-level SLAs)
     * @param checkTime time at which the SLA should be checked
     * @param emails recipients of SLA notifications
     * @param slaActions actions to take when the SLA is missed
     * @param jobSettings per-job SLA settings, may be {@code null}
     * @param slaRule rule the SLA is checked against
     * @throws SLAManagerException if the SLA could not be stored
     */
    public void submitSla(int execId, String id, DateTime checkTime, List<String> emails,
            List<SlaAction> slaActions, List<SlaSetting> jobSettings, SlaRule slaRule) throws SLAManagerException {
        SLA s = new SLA(execId, id, checkTime, emails, slaActions, jobSettings, slaRule);
        logger.info("Submitting SLA " + s.toString());
        try {
            loader.insertSLA(s);
            runner.addRunnerSLA(s);
        } catch (SLAManagerException e) {
            // Log the full exception here: the rethrown exception only carries a
            // message, so the original stack trace would otherwise be lost.
            logger.error("Failed to add new SLA " + s, e);

            // Same message as before when a cause exists; fall back to the
            // exception's own message instead of appending "null".
            final Throwable cause = e.getCause();
            final String detail = (cause != null) ? cause.toString() : e.getMessage();
            throw new SLAManagerException("Failed to add new SLA!" + detail);
        }
    }

    /**
     * Thread that simply invokes the checking of flows when the SLA is
     * ready.
     * 
     */
    public class SLARunner extends Thread {
        // Pending SLAs, ordered by check time through SLAComparator.
        private final PriorityBlockingQueue<SLA> SLAs;
        // Cleared by shutdown() to make the run loop exit.
        private final AtomicBoolean stillAlive = new AtomicBoolean(true);

        // One minute: the longest the runner waits before looking at the queue
        // again, and the interval by which a WAITANDCHECKJOB SLA is re-queued.
        private static final int TIMEOUT_MS = 60000;

        /**
         * Creates a runner with an empty queue of SLAs.
         */
        public SLARunner() {
            SLAs = new PriorityBlockingQueue<SLA>(1, new SLAComparator());
        }

        /**
         * Asks the runner thread to stop: clears the alive flag and interrupts
         * the thread so a pending wait returns.
         */
        public void shutdown() {
            // A requested shutdown is routine, not an error condition.
            logger.info("Shutting down SLA runner thread");
            stillAlive.set(false);
            this.interrupt();
        }

        /**
         * Returns a copy of the SLAs currently queued on this runner.
         *
         * @return a new list holding the queued SLAs
         */
        protected synchronized List<SLA> getRunnerSLAs() {
            final List<SLA> snapshot = new ArrayList<SLA>();
            snapshot.addAll(SLAs);
            return snapshot;
        }

        /**
         * Queues an SLA and interrupts the runner thread so that it
         * recomputes how long to wait.
         *
         * @param s the SLA to queue
         */
        public synchronized void addRunnerSLA(SLA s) {
            logger.info("Adding " + s + " to SLA runner.");
            SLAs.offer(s);
            interrupt();
        }

        /**
         * Takes an SLA out of the queue. The runner thread is not interrupted.
         *
         * @param s the SLA to remove
         */
        public synchronized void removeRunnerSLA(SLA s) {
            final String notice = "Removing " + s + " from the SLA runner.";
            logger.info(notice);
            SLAs.remove(s);
        }

        /**
         * Main loop of the runner thread. While the runner is alive it looks at
         * the SLA with the earliest check time and either
         * <ul>
         * <li>waits (at most {@code TIMEOUT_MS}) when the queue is empty or the
         * check time has not been reached yet, or</li>
         * <li>takes the SLA off the queue and checks it.</li>
         * </ul>
         * A flow-level SLA with rule {@code WAITANDCHECKJOB} submits job-level
         * SLAs for the jobs that have started and is re-queued one interval later
         * while unstarted jobs remain. Every other SLA is evaluated with
         * {@code metSla} and removed. Any exception or error raised while
         * handling an SLA is logged and the loop continues.
         */
        @Override
        public void run() {
            while (stillAlive.get()) {
                synchronized (this) {
                    try {
                        lastCheckTime = System.currentTimeMillis();

                        // TODO clear up the exception handling
                        SLA s = SLAs.peek();

                        if (s == null) {
                            // Nothing queued: wake up every minute or so to see if
                            // there's something to do. Most likely there will not be.
                            try {
                                this.wait(TIMEOUT_MS);
                            } catch (InterruptedException e) {
                                // Interrupts are the wake-up signal used by
                                // addRunnerSLA() and shutdown(); just loop again.
                            }
                        } else if (!(new DateTime(s.getCheckTime())).isAfterNow()) {
                            // The check time has passed, so check the SLA now.
                            // add/remove are synchronized on this runner, so the
                            // polled element is the one that was just peeked.
                            SLA runningSLA = SLAs.poll();

                            logger.info("Checking sla " + runningSLA.toString());

                            int execId = runningSLA.getExecId();
                            // NOTE(review): exflow is not null-checked; a missing
                            // execution would surface as an exception caught below.
                            ExecutableFlow exflow = executorManager.getExecutableFlow(execId);

                            if (runningSLA.getJobName().equals("")
                                    && runningSLA.getRule().equals(SlaRule.WAITANDCHECKJOB)) {
                                // Flow-level placeholder: turn each job setting whose
                                // job has started into a real job-level SLA.
                                List<SlaSetting> jobSettings = runningSLA.getJobSettings();
                                List<SlaSetting> removeSettings = new ArrayList<SLA.SlaSetting>();
                                for (SlaSetting set : jobSettings) {
                                    ExecutableNode node = exflow.getExecutableNode(set.getId());
                                    if (node != null) {
                                        if (node.getStartTime() != -1 || executorManager.isFinished(exflow)) {
                                            submitSla(execId, set.getId(),
                                                    new DateTime(node.getStartTime()).plus(set.getDuration()),
                                                    runningSLA.getEmails(), set.getActions(), null,
                                                    set.getRule());
                                            removeSettings.add(set);
                                            logger.info(
                                                    "Job " + set.getId() + " already started, monitoring SLA.");
                                        }
                                    } else {
                                        // The job no longer exists in the flow.
                                        mailer.sendSlaEmail(runningSLA,
                                                "The SLA setting for flow/job is no longer valid as flow structure has changed. Execution "
                                                        + runningSLA.getExecId());
                                        removeSettings.add(set);
                                    }
                                }
                                for (SlaSetting remove : removeSettings) {
                                    jobSettings.remove(remove);
                                }

                                // The stored SLA is always removed; it is re-inserted
                                // with a later check time while settings remain.
                                removeRunnerSLA(runningSLA);
                                loader.removeSLA(runningSLA);
                                if (jobSettings.size() != 0) {
                                    runningSLA.setCheckTime(runningSLA.getCheckTime().plusMillis(TIMEOUT_MS));
                                    addRunnerSLA(runningSLA);
                                    loader.insertSLA(runningSLA);
                                }
                            } else {
                                if (!metSla(runningSLA, exflow)) {
                                    takeSLAFailActions(runningSLA, exflow);
                                } else {
                                    takeSLASuccessActions(runningSLA, exflow);
                                }

                                removeRunnerSLA(runningSLA);
                                loader.removeSLA(runningSLA);
                            }
                        } else {
                            // Not due yet: wait until the check time, capped at
                            // TIMEOUT_MS.
                            long millisWait = s.getCheckTime().getMillis() - (new DateTime()).getMillis();

                            // The check time may have passed since the isAfterNow()
                            // test above. Object.wait(0) means "wait forever", so a
                            // non-positive delay must skip the wait and loop straight
                            // back to process the SLA.
                            if (millisWait > 0) {
                                try {
                                    this.wait(Math.min(millisWait, TIMEOUT_MS));
                                } catch (InterruptedException e) {
                                    // Interrupts are the wake-up signal used by
                                    // addRunnerSLA() and shutdown(); just loop again.
                                }
                            }
                        }
                    } catch (Exception e) {
                        logger.error("Unexpected exception has been thrown in scheduler", e);
                    } catch (Throwable e) {
                        logger.error("Unexpected throwable has been thrown in scheduler", e);
                    }
                }
            }
        }

        /**
         * Decides whether an SLA has been met. The end time and status come from
         * the flow when the SLA has an empty job name, otherwise from the named
         * job of the flow.
         *
         * @param s the SLA being checked
         * @param exflow the flow execution the SLA refers to
         * @return {@code true} for FINISH when an end time is set and lies before
         *         the check time; {@code true} for SUCCESS when the status is
         *         SUCCEEDED and the end time lies before the check time;
         *         {@code false} otherwise, including for any other rule
         */
        private boolean metSla(SLA s, ExecutableFlow exflow) {
            final SlaRule rule = s.getRule();

            final long finishTime;
            final Status status;
            if (!s.getJobName().equals("")) {
                // Job-level SLA: use the job's own end time and status.
                final ExecutableNode jobNode = exflow.getExecutableNode(s.getJobName());
                finishTime = jobNode.getEndTime();
                status = jobNode.getStatus();
            } else {
                finishTime = exflow.getEndTime();
                status = exflow.getStatus();
            }

            boolean met = false;
            switch (rule) {
            case FINISH: // finished at all, and before the check time
                if (finishTime != -1) {
                    met = finishTime < s.getCheckTime().getMillis();
                }
                break;
            case SUCCESS: // succeeded, and before the check time
                if (status == Status.SUCCEEDED) {
                    met = finishTime < s.getCheckTime().getMillis();
                }
                break;
            default:
                logger.error("Unknown SLA rules!");
                break;
            }
            return met;
        }

        /**
         * Orders SLAs by check time, earliest first.
         */
        private class SLAComparator implements Comparator<SLA> {
            @Override
            public int compare(SLA arg0, SLA arg1) {
                final long rightMillis = arg1.getCheckTime().getMillis();
                final long leftMillis = arg0.getCheckTime().getMillis();

                if (leftMillis > rightMillis) {
                    return 1;
                }
                if (leftMillis < rightMillis) {
                    return -1;
                }
                return 0;
            }
        }
    }

    /**
     * Carries out every action configured on an SLA that was missed.
     * EMAIL sends an alert email. KILL cancels the flow and, if the cancel
     * succeeded, sends a kill notification. A failure in one action is logged
     * and does not stop the remaining actions.
     *
     * @param s the SLA that was missed
     * @param exflow the flow execution the SLA refers to
     */
    private void takeSLAFailActions(SLA s, ExecutableFlow exflow) {
        logger.info("SLA " + s.toString() + " missed! Taking predefined actions");
        List<SlaAction> actions = s.getActions();
        for (SlaAction act : actions) {
            if (SlaAction.EMAIL.equals(act)) {
                try {
                    sendSlaAlertEmail(s, exflow);
                } catch (Exception e) {
                    // Pass the exception itself so the stack trace is logged;
                    // getCause() is frequently null.
                    logger.error("Failed to send out SLA alert email.", e);
                }
            } else if (SlaAction.KILL.equals(act)) {
                try {
                    executorManager.cancelFlow(exflow, "azkaban");
                } catch (ExecutorManagerException e) {
                    logger.error("Cancel flow failed.", e);
                    // No kill notification when the flow was not cancelled.
                    continue;
                }
                try {
                    sendSlaKillEmail(s, exflow);
                } catch (Exception e) {
                    // Same policy as the EMAIL action: a mail failure must not
                    // abort the remaining actions.
                    logger.error("Failed to send out SLA kill email.", e);
                }
            }
        }
    }

    /**
     * Called by the runner when an SLA check passes. Currently a no-op: the
     * success notification call below is commented out.
     *
     * @param s the SLA that was met
     * @param exflow the flow execution the SLA refers to
     */
    private void takeSLASuccessActions(SLA s, ExecutableFlow exflow) {
        //sendSlaSuccessEmail(s, exflow);

    }

    /**
     * Builds and sends the email reporting that an SLA was missed. The text
     * names the flow, or the job and its flow, and gives its start time and its
     * status together with the SLA check time.
     *
     * @param s the SLA that was missed
     * @param exflow the flow execution the SLA refers to
     */
    private void sendSlaAlertEmail(SLA s, ExecutableFlow exflow) {
        final String eol = String.format("%n");
        final String headline;
        final String startedLine;
        final String statusLine;
        final String message;

        switch (s.getRule()) {
        case FINISH:
            if (s.getJobName().equals("")) {
                headline = "Flow " + exflow.getFlowId() + " failed to finish with set SLA." + eol;
                startedLine = "Flow started at " + new DateTime(exflow.getStartTime()).toDateTimeISO() + eol;
                statusLine = "Flow status at " + s.getCheckTime().toDateTimeISO() + " is " + exflow.getStatus();
            } else {
                final ExecutableNode job = exflow.getExecutableNode(s.getJobName());
                headline = "Job " + s.getJobName() + " of flow " + exflow.getFlowId()
                        + " failed to finish with set SLA." + eol;
                startedLine = "Job started at " + new DateTime(job.getStartTime()).toDateTimeISO() + eol;
                statusLine = "Job status at " + s.getCheckTime().toDateTimeISO() + " is " + job.getStatus();
            }
            message = headline + startedLine + statusLine;
            break;
        case SUCCESS:
            if (s.getJobName().equals("")) {
                // The flow variant keeps its extra spaces before the line breaks.
                headline = "Flow " + exflow.getFlowId() + " didn't finish successfully with set SLA. " + eol;
                startedLine = "Flow started at " + new DateTime(exflow.getStartTime()).toDateTimeISO()
                        + "  " + eol;
                statusLine = "Flow status at " + s.getCheckTime().toDateTimeISO() + " is " + exflow.getStatus();
            } else {
                final ExecutableNode job = exflow.getExecutableNode(s.getJobName());
                headline = "Job " + s.getJobName() + " of flow " + exflow.getFlowId()
                        + " didn't finish successfully with set SLA." + eol;
                startedLine = "Job started at " + new DateTime(job.getStartTime()).toDateTimeISO() + eol;
                statusLine = "Job status at " + s.getCheckTime().toDateTimeISO() + " is " + job.getStatus();
            }
            message = headline + startedLine + statusLine;
            break;
        default:
            logger.error("Unknown SLA rules!");
            message = "Unknown SLA was not met!";
            break;
        }

        mailer.sendSlaEmail(s, message);
    }

    /**
     * Builds and sends the email reporting that an SLA was met. The text names
     * the flow, or the job and its flow, and gives its start time and its
     * status together with the SLA check time.
     *
     * @param s the SLA that was met
     * @param exflow the flow execution the SLA refers to
     */
    private void sendSlaSuccessEmail(SLA s, ExecutableFlow exflow) {
        String message = null;
        ExecutableNode exnode;
        switch (s.getRule()) {
        case FINISH:
            if (s.getJobName().equals("")) {
                message = "Flow " + exflow.getFlowId() + " finished within the set SLA." + String.format("%n");
                message += "Flow started at " + new DateTime(exflow.getStartTime()).toDateTimeISO()
                        + String.format("%n");
                message += "Flow status at " + s.getCheckTime().toDateTimeISO() + " is " + exflow.getStatus();
            } else {
                exnode = exflow.getExecutableNode(s.getJobName());
                message = "Job " + s.getJobName() + " of flow " + exflow.getFlowId()
                        + " finished within the set SLA." + String.format("%n");
                message += "Job started at " + new DateTime(exnode.getStartTime()).toDateTimeISO()
                        + String.format("%n");
                message += "Job status at " + s.getCheckTime().toDateTimeISO() + " is " + exnode.getStatus();
            }
            break;
        case SUCCESS:
            if (s.getJobName().equals("")) {
                message = "Flow " + exflow.getFlowId() + " successfully finished within the set SLA."
                        + String.format("%n");
                message += "Flow started at " + new DateTime(exflow.getStartTime()).toDateTimeISO()
                        + String.format("  %n");
                message += "Flow status at " + s.getCheckTime().toDateTimeISO() + " is " + exflow.getStatus();
            } else {
                exnode = exflow.getExecutableNode(s.getJobName());
                message = "Job " + s.getJobName() + " of flow " + exflow.getFlowId()
                        + " successfully finished within the set SLA." + String.format("%n");
                message += "Job started at " + new DateTime(exnode.getStartTime()).toDateTimeISO()
                        + String.format("%n");
                message += "Job status at " + s.getCheckTime().toDateTimeISO() + " is " + exnode.getStatus();
            }
            break;
        default:
            logger.error("Unknown SLA rules!");
            // This is a success notification, so it must not claim the SLA was
            // missed, which the previous copy-pasted text did.
            message = "Unknown SLA rule!";
            break;
        }
        mailer.sendSlaEmail(s, message);
    }

    /**
     * Builds and sends the email reporting that an SLA was missed and the flow
     * was killed. The text names the flow, or the job and its flow, and gives
     * its start time and its status together with the SLA check time.
     *
     * @param s the SLA that was missed
     * @param exflow the flow execution that was cancelled
     */
    private void sendSlaKillEmail(SLA s, ExecutableFlow exflow) {
        final String eol = String.format("%n");
        final String headline;
        final String startedLine;
        final String statusLine;
        final String message;

        switch (s.getRule()) {
        case FINISH:
            if (s.getJobName().equals("")) {
                headline = "Flow " + exflow.getFlowId() + " failed to finish with set SLA and is killed. " + eol;
                startedLine = "Flow started at " + new DateTime(exflow.getStartTime()).toDateTimeISO() + eol;
                statusLine = "Flow status at " + s.getCheckTime().toDateTimeISO() + " is " + exflow.getStatus();
            } else {
                final ExecutableNode job = exflow.getExecutableNode(s.getJobName());
                headline = "Job " + s.getJobName() + " of flow " + exflow.getFlowId()
                        + " failed to finish with set SLA and is killed. " + eol;
                startedLine = "Job started at " + new DateTime(job.getStartTime()).toDateTimeISO() + eol;
                statusLine = "Job status at " + s.getCheckTime().toDateTimeISO() + " is " + job.getStatus();
            }
            message = headline + startedLine + statusLine;
            break;
        case SUCCESS:
            if (s.getJobName().equals("")) {
                // The flow variant keeps its extra spaces before the second line break.
                headline = "Flow " + exflow.getFlowId()
                        + " didn't finish successfully with set SLA and is killed. " + eol;
                startedLine = "Flow started at " + new DateTime(exflow.getStartTime()).toDateTimeISO()
                        + "  " + eol;
                statusLine = "Flow status at " + s.getCheckTime().toDateTimeISO() + " is " + exflow.getStatus();
            } else {
                final ExecutableNode job = exflow.getExecutableNode(s.getJobName());
                headline = "Job " + s.getJobName() + " of flow " + exflow.getFlowId()
                        + " didn't finish successfully with set SLA and is killed. " + eol;
                startedLine = "Job started at " + new DateTime(job.getStartTime()).toDateTimeISO() + eol;
                statusLine = "Job status at " + s.getCheckTime().toDateTimeISO() + " is " + job.getStatus();
            }
            message = headline + startedLine + statusLine;
            break;
        default:
            logger.error("Unknown SLA rules!");
            message = "Unknown SLA was not met!";
            break;
        }

        mailer.sendSlaEmail(s, message);
    }

    /**
     * @return the number of SLAs currently queued on the runner
     */
    public int getNumActiveSLA() {
        final List<SLA> queued = runner.getRunnerSLAs();
        return queued.size();
    }

    /**
     * @return the current {@link Thread.State} of the SLA runner thread
     */
    public State getSLAThreadState() {
        final State runnerState = runner.getState();
        return runnerState;
    }

    /**
     * @return {@code true} while the SLA runner thread is alive
     */
    public boolean isThreadActive() {
        final boolean alive = runner.isAlive();
        return alive;
    }

    /**
     * @return a copy of the SLAs currently queued on the runner
     */
    public List<SLA> getSLAList() {
        final List<SLA> queued = runner.getRunnerSLAs();
        return queued;
    }

    /**
     * @return the wall-clock time in milliseconds of the runner's most recent
     *         loop iteration, or -1 if the loop has not run yet
     */
    public long getLastCheckTime() {
        return this.lastCheckTime;
    }
}