org.apache.hadoop.corona.ClusterManager.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.corona.ClusterManager.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.corona;

import java.io.File;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.*;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.http.HttpServer;
import org.apache.hadoop.mapred.Clock;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.util.CoronaSerializer;
import org.apache.hadoop.util.HostsFileReader;
import org.apache.thrift.TApplicationException;
import org.apache.thrift.TException;
import org.codehaus.jackson.JsonGenerator;

/**
 * Manager of all the resources of the cluster.
 */
public class ClusterManager implements ClusterManagerService.Iface {
    /** Class logger */
    public static final Log LOG = LogFactory.getLog(ClusterManager.class);
    /** Clock that is used for any general purpose system times. */
    public static Clock clock = new Clock();
    /**
     * Minimum number of sessions killed by a single
     * {@link #killSessions(KillSessionsArgs)} call before a log line naming
     * the caller and the killed sessions is generated.
     */
    public static final int KILL_SESSIONS_THRESHOLD = 2;

    /** Node manager manages collections of nodes */
    protected NodeManager nodeManager;

    /** Session manager manages collections of sessions */
    protected SessionManager sessionManager;

    /** Sessions history manager */
    protected SessionHistoryManager sessionHistoryManager;

    /** HTTP server; the cluster manager registers itself on it as attribute "cm" */
    protected HttpServer infoServer;

    /** Scheduler service matches free nodes to runnable sessions */
    protected Scheduler scheduler;

    /**
     * The session notifier asynchronously notifies sessions about
     * various events
     */
    protected SessionNotifier sessionNotifier;

    /** Metrics for the cluster manager */
    protected ClusterManagerMetrics metrics;
    /** Configuration */
    protected CoronaConf conf;

    /** Start time to show in UI. */
    protected long startTime;
    /** When was the CM last restarted (either safely or otherwise) */
    protected long lastRestartTime;
    /** Host name to show in UI. */
    protected String hostName;

    /**
     * Legal values for the "type" of a resource request. Populated and then
     * replaced by an unmodifiable view in {@link #initLegalTypes()}.
     */
    protected Set<ResourceType> legalTypeSet = EnumSet.noneOf(ResourceType.class);

    /** Is the Cluster Manager in Safe Mode */
    protected volatile boolean safeMode;

    /** The thread to restart all the task trackers */
    protected CoronaNodeRestarter nodeRestarter;

    /**
     * No-argument constructor intended as a testing aid. It performs no
     * initialization: the managers, scheduler, notifier, metrics and
     * configuration are all left {@code null}.
     */
    public ClusterManager() {
    }

    /**
     * Primary constructor. Wraps the given configuration in a
     * {@link CoronaConf} and delegates to
     * {@link #ClusterManager(CoronaConf, boolean)}.
     *
     * @param conf Configuration to be used
     * @param recoverFromDisk True if we are restarting after going down while
     *                            in Safe Mode
     * @throws IOException propagated from the delegated constructor
     */
    public ClusterManager(Configuration conf, boolean recoverFromDisk) throws IOException {
        this(new CoronaConf(conf), recoverFromDisk);
    }

    /**
     * Convenience constructor for callers that do not say whether the
     * ClusterManager is restarting from persisted state; a clean start
     * ({@code recoverFromDisk == false}) is assumed.
     *
     * @param conf Configuration to be used
     * @throws IOException propagated from the delegated constructor
     */
    public ClusterManager(Configuration conf) throws IOException {
        // The two-argument overload performs the CoronaConf wrapping.
        this(conf, false);
    }

    /**
     * Construct ClusterManager given {@link CoronaConf}. Builds (or restores)
     * the node manager, session manager and session notifier, then starts the
     * scheduler, the HTTP info server and the node restarter, and finally
     * switches Safe Mode off.
     *
     * @param conf the configuration for the ClusterManager
     * @param recoverFromDisk true if we are restarting after going down while
     *                        in Safe Mode
     * @throws IOException if a state file exists but {@code recoverFromDisk}
     *                     is false, or if a component fails during start-up
     */
    public ClusterManager(CoronaConf conf, boolean recoverFromDisk) throws IOException {
        this.conf = conf;
        HostsFileReader hostsReader = new HostsFileReader(conf.getHostsFile(), conf.getExcludesFile());
        // The legal types must be computed first: the metrics below are built
        // from getTypes().
        initLegalTypes();
        metrics = new ClusterManagerMetrics(getTypes());

        if (recoverFromDisk) {
            recoverClusterManagerFromDisk(hostsReader);
        } else {
            // Refuse a clean start while a persisted state file is still around,
            // so that saved state is never silently ignored.
            File stateFile = new File(conf.getCMStateFile());
            if (stateFile.exists()) {
                throw new IOException("State file " + stateFile.getAbsolutePath()
                        + " exists, but recoverFromDisk is not set, delete the state file first");
            }
            LOG.info("Starting Cluster Manager with clean state");
            startTime = clock.getTime();
            lastRestartTime = startTime;
            nodeManager = new NodeManager(this, hostsReader);
            nodeManager.setConf(conf);
            sessionManager = new SessionManager(this);
            sessionNotifier = new SessionNotifier(sessionManager, this, metrics);
        }
        // Both branches leave the node manager configured; the session manager
        // and notifier are configured here for both.
        sessionManager.setConf(conf);
        sessionNotifier.setConf(conf);

        sessionHistoryManager = new SessionHistoryManager();
        sessionHistoryManager.setConf(conf);

        scheduler = new Scheduler(nodeManager, sessionManager, sessionNotifier, getTypes(), metrics, conf);
        scheduler.start();
        metrics.registerUpdater(scheduler, sessionNotifier);

        InetSocketAddress infoSocAddr = NetUtils.createSocketAddr(conf.getClusterManagerHttpAddress());
        // NOTE(review): the fourth argument is true only when the configured
        // port is 0 - presumably a "find a free port" flag; confirm against
        // HttpServer.
        infoServer = new HttpServer("cm", infoSocAddr.getHostName(), infoSocAddr.getPort(),
                infoSocAddr.getPort() == 0, conf);
        infoServer.setAttribute("cm", this);
        infoServer.start();

        hostName = infoSocAddr.getHostName();

        // We have not completely restored the nodeManager, sessionManager and the
        // sessionNotifier
        if (recoverFromDisk) {
            nodeManager.restoreAfterSafeModeRestart();
            sessionManager.restoreAfterSafeModeRestart();
            sessionNotifier.restoreAfterSafeModeRestart();
        }

        nodeRestarter = new CoronaNodeRestarter(conf, nodeManager);
        nodeRestarter.start();

        // Leave Safe Mode (recovery turned it on). The return value, which
        // reports whether the ProxyJobTracker accepted the flag, is ignored here.
        setSafeMode(false);
    }

    /**
     * Rebuilds the ClusterManager state from the persisted JSON state file
     * when restarting after having gone down in Safe Mode. Safe Mode is
     * switched on in memory first; then the start time, node manager, session
     * manager and session notifier are read back, in that order.
     *
     * @param hostsReader The HostsReader instance handed to the node manager
     * @throws IOException if the state cannot be read
     */
    private void recoverClusterManagerFromDisk(HostsFileReader hostsReader) throws IOException {
        File persistedState = new File(conf.getCMStateFile());
        LOG.info("Restoring state from " + persistedState.getAbsolutePath());

        // Turning Safe Mode on up front keeps the expireNodes and
        // expireSessions threads from expiring nodes and sessions while the
        // state is being rebuilt.
        safeMode = true;
        LOG.info("Safe mode is now: " + (safeMode ? "ON" : "OFF"));

        CoronaSerializer stateReader = new CoronaSerializer(conf);

        // The persisted state is a single JSON object for the ClusterManager.
        stateReader.readStartObjectToken("ClusterManager");

        stateReader.readField("startTime");
        startTime = stateReader.readValueAs(Long.class);

        stateReader.readField("nodeManager");
        nodeManager = new NodeManager(this, hostsReader, stateReader);
        nodeManager.setConf(conf);

        stateReader.readField("sessionManager");
        sessionManager = new SessionManager(this, stateReader);

        stateReader.readField("sessionNotifier");
        sessionNotifier = new SessionNotifier(sessionManager, this, metrics, stateReader);

        // Closing token of the ClusterManager object.
        stateReader.readEndObjectToken("ClusterManager");

        lastRestartTime = clock.getTime();
    }

    /**
     * Collects every resource type that appears in the configured
     * CPU-to-resource partitioning into {@link #legalTypeSet}, then replaces
     * the set with an unmodifiable view of itself.
     */
    protected void initLegalTypes() {
        // Only the resource types matter here; the CPU-count keys are ignored.
        for (Map<ResourceType, Integer> partitioning : conf.getCpuToResourcePartitioning().values()) {
            legalTypeSet.addAll(partitioning.keySet());
        }
        legalTypeSet = Collections.unmodifiableSet(legalTypeSet);
    }

    /**
     * @return the metrics object of this cluster manager
     */
    public ClusterManagerMetrics getMetrics() {
        return this.metrics;
    }

    /**
     * @return the session notifier used by this cluster manager
     */
    public SessionNotifier getSessionNotifier() {
        return this.sessionNotifier;
    }

    /**
     * @return the session manager used by this cluster manager
     */
    public SessionManager getSessionManager() {
        return this.sessionManager;
    }

    /**
     * @return the node manager used by this cluster manager
     */
    public NodeManager getNodeManager() {
        return this.nodeManager;
    }

    /**
     * @return the scheduler used by this cluster manager
     */
    public Scheduler getScheduler() {
        return this.scheduler;
    }

    /**
     * @return an unmodifiable view of the legal resource types
     */
    public Collection<ResourceType> getTypes() {
        Collection<ResourceType> readOnlyTypes = Collections.unmodifiableCollection(legalTypeSet);
        return readOnlyTypes;
    }

    /**
     * Guard used at the top of the service methods: when the Safe Mode flag
     * is set, the rejected call is logged and a SafeModeException is thrown;
     * otherwise the method returns normally.
     *
     * @param methodName name of the calling method, used in the log line
     * @throws SafeModeException if the ClusterManager is in Safe Mode
     */
    private void checkSafeMode(String methodName) throws SafeModeException {
        if (!safeMode) {
            return;
        }
        LOG.info(methodName + "() called while ClusterManager is in Safe Mode");
        throw new SafeModeException();
    }

    /**
     * Resolves the pool that will actually be used for the user-specified
     * pool and job input size carried by the arguments.
     *
     * @param actualPoolInfoArgs user-specified pool info and job input size
     * @return the redirected pool info in string form
     * @throws InvalidPoolInfo if the redirected pool fails validation
     * @throws SafeModeException if the ClusterManager is in Safe Mode
     * @throws TException declared by the service interface
     */
    @Override
    public PoolInfoStrings getActualPoolInfo(ActualPoolInfoArgs actualPoolInfoArgs)
            throws TException, InvalidPoolInfo, SafeModeException {
        checkSafeMode("getActualPoolInfo");
        ConfigManager configManager = getScheduler().getConfigManager();
        PoolInfo redirectedPoolInfo = resolveRedirectedPoolInfo(actualPoolInfoArgs, configManager);
        return PoolInfo.createPoolInfoStrings(redirectedPoolInfo);
    }

    /**
     * Same resolution as {@link #getActualPoolInfo(ActualPoolInfoArgs)}, but
     * the response additionally carries the whitelist the config manager
     * reports for the redirected pool.
     *
     * @param actualPoolInfoArgs user-specified pool info and job input size
     * @return response holding the redirected pool info and its whitelist
     * @throws InvalidPoolInfo if the redirected pool fails validation
     * @throws SafeModeException if the ClusterManager is in Safe Mode
     * @throws TException declared by the service interface
     */
    @Override
    public ActualPoolInfoResponse getActualPoolInfoV2(ActualPoolInfoArgs actualPoolInfoArgs)
            throws InvalidPoolInfo, SafeModeException, TException {
        checkSafeMode("getActualPoolInfoV2");
        ConfigManager configManager = getScheduler().getConfigManager();
        PoolInfo redirectedPoolInfo = resolveRedirectedPoolInfo(actualPoolInfoArgs, configManager);

        ActualPoolInfoResponse actualPoolInfoResponse = new ActualPoolInfoResponse();
        actualPoolInfoResponse.poolInfoString = PoolInfo.createPoolInfoStrings(redirectedPoolInfo);
        actualPoolInfoResponse.whitelist = configManager.getWhitelist(redirectedPoolInfo);
        return actualPoolInfoResponse;
    }

    /**
     * Shared core of both getActualPoolInfo variants: applies the configured
     * redirect to the user-specified pool and validates the result.
     *
     * The throws clause mirrors what the two callers declare, so anything they
     * were allowed to propagate still propagates unchanged.
     *
     * @param actualPoolInfoArgs user-specified pool info and job input size
     * @param configManager config manager used for the redirect and validation
     * @return the redirected, validated pool info
     * @throws InvalidPoolInfo if validation of the redirected pool fails
     */
    private PoolInfo resolveRedirectedPoolInfo(ActualPoolInfoArgs actualPoolInfoArgs,
            ConfigManager configManager) throws TException, InvalidPoolInfo, SafeModeException {
        PoolInfoStrings userSpecifiedPoolInfo = actualPoolInfoArgs.poolInfoString;
        long jobSizeInfo = actualPoolInfoArgs.jobInputSize;
        PoolInfo poolInfo = PoolInfo.createPoolInfo(userSpecifiedPoolInfo);
        // Get Redirect pool info
        PoolInfo redirectedPoolInfo = configManager.getRedirect(poolInfo, jobSizeInfo);
        // Validate redirected pool information; the validator reports problems
        // as InvalidSessionHandle, which is translated for pool-info callers.
        try {
            PoolGroupManager.checkPoolInfoIfStrict(redirectedPoolInfo, configManager, conf);
        } catch (InvalidSessionHandle ex) {
            throw new InvalidPoolInfo(ex.getHandle());
        }
        return redirectedPoolInfo;
    }

    /**
     * Hands out the next session id from the session manager.
     *
     * @return the next session id
     * @throws SafeModeException if the ClusterManager is in Safe Mode
     */
    @Override
    public String getNextSessionId() throws SafeModeException {
        checkSafeMode("getNextSessionId");
        String nextId = sessionManager.getNextSessionId();
        return nextId;
    }

    /**
     * Registers a new session with the session manager.
     *
     * @param handle handle of the session being started
     * @param info description of the session
     * @return registration data holding the session handle, the session log
     *         path from the history manager, and the session's pool info
     * @throws InvalidSessionHandle propagated from the session manager
     * @throws SafeModeException if the ClusterManager is in Safe Mode
     * @throws TException declared by the service interface
     */
    @Override
    public SessionRegistrationData sessionStart(String handle, SessionInfo info)
            throws TException, InvalidSessionHandle, SafeModeException {
        checkSafeMode("sessionStart");
        // The log path is looked up before the session is added.
        String logPath = sessionHistoryManager.getLogPath(handle);
        Session registered = sessionManager.addSession(handle, info);
        return new SessionRegistrationData(
                registered.getHandle(),
                new ClusterManagerInfo("", logPath),
                PoolInfo.createPoolInfoStrings(registered.getPoolInfo()));
    }

    /**
     * Ends a session: updates the failure/timeout metrics, deletes the
     * session, hands every canceled grant back to its node, wakes the
     * scheduler and removes the session from the notifier.
     *
     * @param handle handle of the session to end
     * @param status final status of the session
     * @throws InvalidSessionHandle propagated from the session manager
     * @throws SafeModeException if the ClusterManager is in Safe Mode
     * @throws TException a TApplicationException carrying the message of any
     *                    RuntimeException raised while ending the session
     */
    @Override
    public void sessionEnd(String handle, SessionStatus status)
            throws TException, InvalidSessionHandle, SafeModeException {
        checkSafeMode("sessionEnd");
        try {
            Session endingSession = sessionManager.getSession(handle);
            InetAddress address = endingSession.getAddress();
            LOG.info("sessionEnd called for session: " + handle + " on " + address.getHost() + ":"
                    + address.getPort() + " with status: " + status);

            // A timed-out session whose URL is set but does not mention the
            // handle is counted as a remote job tracker timeout.
            boolean timedOut = status == SessionStatus.TIMED_OUT;
            if (timedOut && endingSession.getUrl() != null && endingSession.getUrl().indexOf(handle) < 0) {
                metrics.timeoutRemoteJT(1);
            }
            if (status == SessionStatus.FAILED_JOBTRACKER) {
                metrics.recordCJTFailure();
            }

            Collection<ResourceGrant> canceled = sessionManager.deleteSession(handle, status);
            if (canceled != null) {
                for (ResourceGrant canceledGrant : canceled) {
                    nodeManager.cancelGrant(canceledGrant.nodeName, handle, canceledGrant.id);
                    metrics.releaseResource(canceledGrant.type);
                }

                scheduler.notifyScheduler();
                sessionNotifier.deleteSession(handle);
            }
        } catch (RuntimeException failure) {
            LOG.error("Error in sessionEnd of " + handle, failure);
            throw new TApplicationException(failure.getMessage());
        }
    }

    /**
     * Records a heartbeat for the session and then replaces its info.
     *
     * @param handle handle of the session
     * @param info new session info
     * @throws InvalidSessionHandle propagated from the session manager
     * @throws SafeModeException if the ClusterManager is in Safe Mode
     * @throws TException a TApplicationException carrying the message of any
     *                    RuntimeException raised during the update
     */
    @Override
    public void sessionUpdateInfo(String handle, SessionInfo info)
            throws TException, InvalidSessionHandle, SafeModeException {
        checkSafeMode("sessionUpdateInfo");
        try {
            LOG.info("sessionUpdateInfo called for session: " + handle + " with info: " + info);
            // The update doubles as a sign of life from the session.
            sessionManager.heartbeat(handle);
            sessionManager.updateInfo(handle, info);
        } catch (RuntimeException failure) {
            String reason = failure.getMessage();
            throw new TApplicationException(reason);
        }
    }

    /**
     * Records a heartbeat for the given session.
     *
     * @param handle handle of the session
     * @throws InvalidSessionHandle propagated from the session manager
     * @throws SafeModeException if the ClusterManager is in Safe Mode
     * @throws TException a TApplicationException carrying the message of any
     *                    RuntimeException raised by the session manager
     */
    @Override
    public void sessionHeartbeat(String handle) throws TException, InvalidSessionHandle, SafeModeException {
        checkSafeMode("sessionHeartbeat");
        try {
            sessionManager.heartbeat(handle);
        } catch (RuntimeException failure) {
            String reason = failure.getMessage();
            throw new TApplicationException(reason);
        }
    }

    /**
     * Heartbeat that also carries job tracker info. If the session rejects
     * the supplied info, the session is ended with status FAILED_JOBTRACKER;
     * in either case the heartbeat is then forwarded to the session manager.
     *
     * @param handle handle of the session
     * @param jtInfo heartbeat arguments to check and record
     * @throws InvalidSessionHandle propagated from the session manager
     * @throws SafeModeException if the ClusterManager is in Safe Mode
     * @throws TException a TApplicationException carrying the message of any
     *                    RuntimeException raised while processing
     */
    @Override
    public void sessionHeartbeatV2(String handle, HeartbeatArgs jtInfo)
            throws TException, InvalidSessionHandle, SafeModeException {
        checkSafeMode("sessionHeartbeatV2");
        try {
            Session beatingSession = sessionManager.getSession(handle);
            boolean infoAccepted = beatingSession.checkHeartbeatInfo(jtInfo);
            if (!infoAccepted) {
                sessionEnd(beatingSession.getSessionId(), SessionStatus.FAILED_JOBTRACKER);
            }
            sessionManager.heartbeatV2(handle, jtInfo);
        } catch (RuntimeException failure) {
            throw new TApplicationException(failure.getMessage());
        }
    }

    /**
     * Verifies that every request asks for a resource type contained in
     * {@link #legalTypeSet}.
     *
     * @param requestList List of resource requests to check
     * @return false as soon as one request has an illegal type, true if all
     *         requests are legal (including when the list is empty)
     */
    protected boolean checkResourceRequestType(List<ResourceRequest> requestList) {
        for (ResourceRequest request : requestList) {
            boolean legal = legalTypeSet.contains(request.type);
            if (!legal) {
                return false;
            }
        }
        return true;
    }

    /**
     * Verifies that no request asks for a host that the same request also
     * excludes. Requests that have no host list or no exclude list are
     * skipped.
     *
     * @param requestList List of resource requests to check
     * @return false if any request names a host it also excludes, true
     *         otherwise
     */
    protected boolean checkResourceRequestExcluded(List<ResourceRequest> requestList) {
        // One scratch set, refilled for each request.
        Set<String> banned = new HashSet<String>();
        for (ResourceRequest request : requestList) {
            boolean comparable = request.getExcludeHosts() != null && request.getHosts() != null;
            if (comparable) {
                banned.clear();
                banned.addAll(request.getExcludeHosts());
                for (String wanted : request.getHosts()) {
                    if (banned.contains(wanted)) {
                        return false;
                    }
                }
            }
        }
        return true;
    }

    /**
     * Counts the requested resources per type and rejects the request when
     * any count exceeds the maximum configured for the session's pool. Pools
     * that are not configured to enforce a request maximum are not checked.
     *
     * @param requestList List of resource requests to check
     * @param handle handle of the requesting session
     * @throws InvalidSessionHandle if the session lookup fails, or if a
     *                              per-type limit is exceeded
     */
    protected void checkResourceRequestLimit(List<ResourceRequest> requestList, String handle)
            throws InvalidSessionHandle {
        ConfigManager poolConfig = getScheduler().getConfigManager();
        PoolInfo pool = sessionManager.getSession(handle).getPoolInfo();

        // Limits apply only to pools configured to cap simultaneous requests.
        if (poolConfig.useRequestMax(pool)) {
            // Tally the requests by resource type.
            ResourceTypeCounter requested = new ResourceTypeCounter();
            for (ResourceRequest request : requestList) {
                requested.incr(request.type);
            }

            // Every type must stay within the pool maximum.
            for (ResourceType type : ResourceType.values()) {
                if (poolConfig.getPoolMaximum(pool, type) < requested.getCount(type)) {
                    String failureMessage = "Session " + handle + " requested "
                            + requested.getCount(type) + " resources for resource type "
                            + type + " but was only allowed "
                            + poolConfig.getPoolMaximum(pool, type) + ", " + "so failing the job";
                    LOG.error(failureMessage);
                    throw new InvalidSessionHandle(failureMessage);
                }
            }
        }
    }

    /**
     * Registers a batch of resource requests for a session. The requests are
     * validated (legal types, no requested host that is also excluded, pool
     * limits), the session is heartbeated, requested hosts are resolved to
     * nodes, and the scheduler is notified.
     *
     * @param handle handle of the requesting session
     * @param requestList the resource requests
     * @throws InvalidSessionHandle if the session is unknown or exceeds its
     *                              pool's request limit
     * @throws SafeModeException if the ClusterManager is in Safe Mode
     * @throws TException a TApplicationException for an illegal resource
     *                    type, a requested excluded host, or any
     *                    RuntimeException raised while processing
     */
    @Override
    public void requestResource(String handle, List<ResourceRequest> requestList)
            throws TException, InvalidSessionHandle, SafeModeException {
        checkSafeMode("requestResource");
        try {
            LOG.info("Request " + requestList.size() + " resources from session: " + handle);
            if (!checkResourceRequestType(requestList)) {
                LOG.error("Bad resource type from session: " + handle);
                throw new TApplicationException("Bad resource type");
            }
            if (!checkResourceRequestExcluded(requestList)) {
                LOG.error("Bad excluded hosts from session: " + handle);
                throw new TApplicationException("Requesting excluded hosts");
            }
            checkResourceRequestLimit(requestList, handle);

            sessionManager.heartbeat(handle);
            sessionManager.getSession(handle).setResourceRequest(requestList);
            List<ResourceRequestInfo> reqInfoList = new ArrayList<ResourceRequestInfo>(requestList.size());
            for (ResourceRequest request : requestList) {
                List<String> hosts = request.getHosts();
                // requestedNodes stays null when the request names no hosts.
                List<RequestedNode> requestedNodes = null;
                if (hosts != null && !hosts.isEmpty()) {
                    requestedNodes = new ArrayList<RequestedNode>(hosts.size());
                    for (String host : hosts) {
                        requestedNodes.add(nodeManager.resolve(host, request.type));
                    }
                }
                reqInfoList.add(new ResourceRequestInfo(request, requestedNodes));
            }
            sessionManager.requestResource(handle, reqInfoList);
            for (ResourceRequest req : requestList) {
                metrics.requestResource(req.type);
            }
            scheduler.notifyScheduler();
        } catch (RuntimeException e) {
            // Report through the class logger (as sessionEnd does) rather than
            // printing the stack trace to stderr, so the failure is recorded
            // together with the session handle.
            LOG.error("Error in requestResource of " + handle, e);
            throw new TApplicationException(e.getMessage());
        }
    }

    /**
     * Releases the given resources of a session. The session is heartbeated,
     * every grant the session manager reports as canceled is handed back to
     * its node, and the scheduler is notified. Nothing further happens when
     * the session manager reports no canceled grants ({@code null}).
     *
     * @param handle handle of the releasing session
     * @param idList ids of the resources to release
     * @throws InvalidSessionHandle propagated from the session manager
     * @throws SafeModeException if the ClusterManager is in Safe Mode
     * @throws TException a TApplicationException carrying the message of any
     *                    RuntimeException raised while processing
     */
    @Override
    public void releaseResource(String handle, List<Integer> idList)
            throws TException, InvalidSessionHandle, SafeModeException {
        checkSafeMode("releaseResource");
        try {
            LOG.info("Release " + idList.size() + " resources from session: " + handle);
            sessionManager.heartbeat(handle);
            Collection<ResourceGrant> released = sessionManager.releaseResource(handle, idList);

            if (released != null) {
                for (ResourceGrant releasedGrant : released) {
                    nodeManager.cancelGrant(releasedGrant.nodeName, handle, releasedGrant.id);
                    metrics.releaseResource(releasedGrant.type);
                }

                scheduler.notifyScheduler();
            }
        } catch (RuntimeException failure) {
            throw new TApplicationException(failure.getMessage());
        }
    }

    /**
     * Processes a heartbeat from a cluster node. The scheduler is notified
     * when the node manager asks for it, and the response tells the node
     * whether the node restarter wants it restarted.
     *
     * @param node information reported by the node
     * @return response whose restart flag is true only when a node restarter
     *         exists and its status check for this node returns true
     * @throws DisallowedNode propagated from the node manager
     * @throws SafeModeException if the ClusterManager is in Safe Mode
     * @throws TException declared by the service interface
     */
    @Override
    public NodeHeartbeatResponse nodeHeartbeat(ClusterNodeInfo node)
            throws TException, DisallowedNode, SafeModeException {
        checkSafeMode("nodeHeartbeat");
        boolean wakeScheduler = nodeManager.heartbeat(node);
        if (wakeScheduler) {
            scheduler.notifyScheduler();
        }
        NodeHeartbeatResponse response = new NodeHeartbeatResponse();
        boolean restartWanted = nodeRestarter != null && nodeRestarter.checkStatus(node);
        response.setRestartFlag(restartWanted);
        return response;
    }

    /**
     * Forwards a session's node usage feedback to the node manager.
     *
     * @param handle handle of the reporting session
     * @param resourceTypes resource types the feedback applies to
     * @param reportList usage reports for individual nodes
     * @throws InvalidSessionHandle declared by the service interface
     * @throws SafeModeException if the ClusterManager is in Safe Mode
     * @throws TException declared by the service interface
     */
    @Override
    public void nodeFeedback(String handle, List<ResourceType> resourceTypes, List<NodeUsageReport> reportList)
            throws TException, InvalidSessionHandle, SafeModeException {
        checkSafeMode("nodeFeedback");
        String receipt = "Received feedback from session " + handle;
        LOG.info(receipt);
        nodeManager.nodeFeedback(handle, resourceTypes, reportList);
    }

    /**
     * Asks the node manager to refresh its nodes.
     *
     * @throws SafeModeException if the ClusterManager is in Safe Mode
     * @throws TException wrapping any IOException raised by the refresh
     */
    @Override
    public void refreshNodes() throws TException, SafeModeException {
        checkSafeMode("refreshNodes");
        try {
            nodeManager.refreshNodes();
        } catch (IOException ioFailure) {
            // The service interface only allows Thrift exceptions.
            throw new TException(ioFailure);
        }
    }

    /**
     * Queues all alive cluster nodes for restart with the node restarter.
     * Nothing is queued when there are no alive nodes or no node restarter.
     *
     * @param restartNodesArgs carries the force flag and the batch size
     * @return an empty response object
     * @throws SafeModeException if the ClusterManager is in Safe Mode
     * @throws TException declared by the service interface
     */
    @Override
    public RestartNodesResponse restartNodes(RestartNodesArgs restartNodesArgs)
            throws TException, SafeModeException {
        // Pass this method's real name so the Safe Mode log line points at
        // the call that was actually rejected.
        checkSafeMode("restartNodes");
        LOG.info("Got request to restart all the cluster nodes with batch size " + restartNodesArgs.getBatchSize());
        List<ClusterNode> allNodes = nodeManager.getAliveClusterNodes();
        if (allNodes.size() > 0 && nodeRestarter != null) {
            nodeRestarter.add(allNodes, restartNodesArgs.isForce(), restartNodesArgs.getBatchSize());
        } else {
            LOG.info("There is no cluster node to restart");
        }
        return new RestartNodesResponse();
    }

    /**
     * Sets the Safe Mode flag on the Cluster Manager, and on the
     * ProxyJobTracker. A return value of false signals that setting the flag
     * on the ProxyJobTracker failed; in that case coronaadmin should be run
     * with the -forceSetSafeModeOnPJT or -forceUnsetSafeModeOnPJT options.
     *
     * Note that when Safe Mode is being switched off, the in-memory flag is
     * cleared before the ProxyJobTracker is contacted, so it stays cleared
     * even if that call fails. When Safe Mode is being switched on, the
     * in-memory flag is only set after the ProxyJobTracker call succeeds.
     *
     * Calling this function multiple times does not matter, because all
     * operations (apart from resetting of the last heartbeat time) in this
     * function, and in the setClusterManagerSafeModeFlag function in the
     * ProxyJobTracker are idempotent.
     *
     * @param safeMode The value of Safe Mode flag that we want to be set.
     * @return true, if setting the Safe Mode flag succeeded, false otherwise.
     */
    @Override
    public synchronized boolean setSafeMode(boolean safeMode) {
        if (!safeMode) {
            // Leaving Safe Mode: reset the last heartbeat timestamp of every
            // session and node first.
            LOG.info("Resetting the heartbeat times for all sessions");
            sessionManager.resetSessionsLastHeartbeatTime();
            LOG.info("Resetting the heartbeat times for all nodes");
            nodeManager.resetNodesLastHeartbeatTime();
            // When switching off, the flag is cleared in memory before it is
            // cleared at the CPJT.
            this.safeMode = false;
        }

        try {
            ClusterManagerAvailabilityChecker.getPJTClient(conf).setClusterManagerSafeModeFlag(safeMode);
        } catch (IOException ioFailure) {
            LOG.info("Exception while setting the safe mode flag in ProxyJobTracker: " + ioFailure.getMessage());
            return false;
        } catch (TException thriftFailure) {
            LOG.info("Exception while setting the safe mode flag in ProxyJobTracker: " + thriftFailure.getMessage());
            return false;
        }

        this.safeMode = safeMode;
        LOG.info("Flag successfully set in ProxyJobTracker");
        LOG.info("Safe mode is now: " + (this.safeMode ? "ON" : "OFF"));
        return true;
    }

    /**
     * Reports the current value of the Safe Mode flag.
     *
     * @return true if the ClusterManager is in Safe Mode
     */
    public boolean getSafeMode() {
        return this.safeMode;
    }

    /**
     * Saves the state of the ClusterManager to disk as one JSON object with
     * the fields startTime, nodeManager, sessionManager and sessionNotifier.
     * The state is only persisted while the ClusterManager is in Safe Mode.
     *
     * The JSON generator is closed on every path, so a failed write does not
     * leak it.
     *
     * @return A boolean. True if saving the state succeeded, false otherwise
     *         (not in Safe Mode, or an I/O error while writing).
     */
    @Override
    public boolean persistState() {
        if (!safeMode) {
            LOG.info("Cannot persist state because ClusterManager is not in Safe Mode");
            return false;
        }

        JsonGenerator jsonGenerator = null;
        try {
            jsonGenerator = CoronaSerializer.createJsonGenerator(conf);
            jsonGenerator.writeStartObject();

            jsonGenerator.writeFieldName("startTime");
            jsonGenerator.writeNumber(startTime);

            jsonGenerator.writeFieldName("nodeManager");
            nodeManager.write(jsonGenerator);

            jsonGenerator.writeFieldName("sessionManager");
            sessionManager.write(jsonGenerator);

            jsonGenerator.writeFieldName("sessionNotifier");
            sessionNotifier.write(jsonGenerator);

            jsonGenerator.writeEndObject();
            // Close on the success path inside the try so that a failure while
            // closing is still reported as a failed persist.
            jsonGenerator.close();
            jsonGenerator = null;
            return true;
        } catch (IOException e) {
            LOG.info("Could not persist the state: ", e);
            return false;
        } finally {
            // Only reached with a live generator when something above failed.
            if (jsonGenerator != null) {
                try {
                    jsonGenerator.close();
                } catch (IOException closeFailure) {
                    LOG.warn("Could not close the state generator after a failed persist", closeFailure);
                }
            }
        }
    }

    /**
     * Builds a snapshot of the sessions known to the session manager. Each
     * entry carries the session's handle, name, user, pool info, deadline,
     * priority and its grant count for every resource type. Sessions that
     * turn out to be invalid while being read are skipped.
     *
     * @return list of running sessions
     * @throws SafeModeException if the ClusterManager is in Safe Mode
     * @throws TException declared by the service interface
     */
    @Override
    public List<RunningSession> getSessions() throws TException, SafeModeException {
        checkSafeMode("getSessions");
        List<RunningSession> snapshot = new LinkedList<RunningSession>();
        for (String sessionId : sessionManager.getSessions()) {
            try {
                Session current = sessionManager.getSession(sessionId);

                // The session's own monitor is held while its fields are read.
                synchronized (current) {
                    RunningSession entry = new RunningSession(current.getHandle(), current.getName(),
                            current.getUserId(), PoolInfo.createPoolInfoStrings(current.getPoolInfo()));
                    entry.setDeadline(current.getDeadline());
                    entry.setPriority(current.getInfo().getPriority());

                    Map<ResourceType, Integer> grantsByType = new EnumMap<ResourceType, Integer>(
                            ResourceType.class);
                    for (ResourceType type : ResourceType.values()) {
                        grantsByType.put(type, current.getGrantCountForType(type));
                    }
                    entry.setRunningResources(grantsByType);
                    snapshot.add(entry);
                }
            } catch (InvalidSessionHandle finished) {
                // This is no big deal, just means that the session has finished
            }
        }
        return snapshot;
    }

    /**
     * Looks up a session and returns its info.
     *
     * @param handle handle of the session
     * @return the info of the session
     * @throws InvalidSessionHandle propagated from the session manager
     * @throws SafeModeException if the ClusterManager is in Safe Mode
     * @throws TException declared by the service interface
     */
    @Override
    public SessionInfo getSessionInfo(String handle) throws TException, InvalidSessionHandle, SafeModeException {
        checkSafeMode("getSessionInfo");
        return sessionManager.getSession(handle).getInfo();
    }

    /**
     * Kills a session by ending it with status KILLED.
     *
     * @param sessionId id of the session to kill
     * @throws SafeModeException if the ClusterManager is in Safe Mode
     * @throws TException wrapping an InvalidSessionHandle raised while ending
     *                    the session, or propagated from sessionEnd
     */
    @Override
    public void killSession(String sessionId) throws TException, SafeModeException {
        checkSafeMode("killSession");
        try {
            LOG.info("Killing session " + sessionId);
            sessionEnd(sessionId, SessionStatus.KILLED);
        } catch (InvalidSessionHandle unknownSession) {
            // This method does not declare InvalidSessionHandle, so it is wrapped.
            throw new TException(unknownSession);
        }
    }

    /**
     * Kills the given sessions one after another, stopping at the first
     * failure. When at least {@link #KILL_SESSIONS_THRESHOLD} sessions were
     * killed, a single log line names the caller and the killed sessions;
     * this also happens when a later kill fails.
     *
     * @param killSessionsArgs carries the caller ({@code who}) and the ids of
     *                         the sessions to kill
     * @throws SafeModeException if the ClusterManager is in Safe Mode
     * @throws TException propagated from {@link #killSession(String)}
     */
    @Override
    public void killSessions(KillSessionsArgs killSessionsArgs) throws SafeModeException, TException {
        StringBuilder msg = new StringBuilder();
        msg.append(killSessionsArgs.who);
        msg.append(" killed session");

        int killed = 0;

        // The try/finally wraps the whole loop so the summary is logged once,
        // with the full list, instead of once per iteration past the threshold.
        try {
            for (String id : killSessionsArgs.sessionIds) {
                killSession(id);

                ++killed;
                msg.append(" ");
                msg.append(id);
            }
        } finally {
            if (killed >= KILL_SESSIONS_THRESHOLD) {
                LOG.info(msg);
            }
        }
    }

    /**
     * This is an internal api called to tell the cluster manager that a
     * particular node seems dysfunctional and that it should be removed
     * from the cluster. The node is dropped from the node restarter (if
     * there is one) and deleted from the node manager; when the deletion
     * returns grants, they are revoked, the sessions that were on the node
     * are notified and the scheduler is woken up.
     *
     * @param nodeName Node to be removed
     */
    public void nodeTimeout(String nodeName) {
        if (nodeRestarter != null) {
            nodeRestarter.delete(nodeName);
        }
        // The sessions are captured before the node is deleted.
        Set<String> sessionsOnNode = nodeManager.getNodeSessions(nodeName);
        Set<ClusterNode.GrantId> revoked = nodeManager.deleteNode(nodeName);
        if (revoked != null) {
            handleRevokedGrants(nodeName, revoked);
            handleDeadNode(nodeName, sessionsOnNode);
            scheduler.notifyScheduler();
        }
    }

    /**
     * This is an internal api called to tell the cluster manager that a
     * particular node is excluded from the cluster. After logging the event
     * it is handled exactly like a node timeout.
     *
     * @param nodeName
     *          Node to be removed
     */
    public void nodeDecommisioned(String nodeName) {
        String notice = "Node decommissioned: " + nodeName;
        LOG.info(notice);
        // Decommissioning shares the timeout logic.
        this.nodeTimeout(nodeName);
    }

    /**
     * This is an internal api called to tell the cluster manager that a
     * particular type of resource is no longer available on a node. When the
     * node manager returns grants for the removed resource, the sessions on
     * the node that use this resource type are notified, the grants are
     * revoked and the scheduler is woken up.
     *
     * @param nodeName
     *          Name of the node on which the resource is removed.
     * @param type
     *          The type of resource to be removed.
     */
    public void nodeAppRemoved(String nodeName, ResourceType type) {
        // The sessions are captured before the resource is removed.
        Set<String> sessions = nodeManager.getNodeSessions(nodeName);
        Set<ClusterNode.GrantId> grantsToRevoke = nodeManager.deleteAppFromNode(nodeName, type);
        if (grantsToRevoke == null) {
            return;
        }
        // Only sessions that use the removed resource type are notified.
        Set<String> affectedSessions = new HashSet<String>();
        for (String sessionHandle : sessions) {
            try {
                if (sessionManager.getSession(sessionHandle).getTypes().contains(type)) {
                    affectedSessions.add(sessionHandle);
                }
            } catch (InvalidSessionHandle ex) {
                // Ignored on purpose: the session is simply left out. The
                // message names this operation instead of a node timeout.
                LOG.warn("Found invalid session: " + sessionHandle + " while removing " + type
                        + " from node: " + nodeName);
            }
        }
        handleDeadNode(nodeName, affectedSessions);
        handleRevokedGrants(nodeName, grantsToRevoke);
        scheduler.notifyScheduler();
    }

    /**
     * Revokes, session by session, the grants that were removed from a node.
     * Grants whose session is no longer valid are logged and skipped.
     *
     * @param nodeName
     *          The node name.
     * @param grantsToRevoke
     *          The grants to revoke.
     */
    private void handleRevokedGrants(String nodeName, Set<ClusterNode.GrantId> grantsToRevoke) {
        for (ClusterNode.GrantId revokedGrant : grantsToRevoke) {
            String owner = revokedGrant.getSessionId();

            try {
                sessionManager.revokeResource(owner, Collections.singletonList(revokedGrant.getRequestId()));
            } catch (InvalidSessionHandle unknownSession) {
                // Ignored on purpose: there is nothing left to revoke from.
                LOG.warn("Found invalid session: " + owner + " while timing out node: " + nodeName);
            }
        }
    }

    /**
     * Tells each of the given sessions, through the session notifier, that
     * the node went dead.
     *
     * @param nodeName the name of the node that went dead
     * @param sessions handles of the sessions to notify
     */
    private void handleDeadNode(String nodeName, Set<String> sessions) {
        LOG.info("Notify sessions: " + sessions + " about dead node " + nodeName);
        for (String affectedHandle : sessions) {
            sessionNotifier.notifyDeadNode(affectedHandle, nodeName);
        }
    }

    /**
     * @return the start time of the cluster manager, as shown in the UI
     */
    public long getStartTime() {
        return this.startTime;
    }

    /**
     * Returns the time at which the CM was last restarted, either safely or
     * otherwise.
     *
     * @return the recorded restart time, as read from the cluster manager
     *         clock
     */
    public long getLastRestartTime() {
        return this.lastRestartTime;
    }

    /**
     * @return the host name of the HTTP info server address, as shown in the UI
     */
    public String getHostName() {
        return this.hostName;
    }

}