ch.epfl.eagle.daemon.nodemonitor.NodeMonitor.java Source code

Java tutorial

Introduction

Here is the source code for ch.epfl.eagle.daemon.nodemonitor.NodeMonitor.java

Source

/*
 * EAGLE 
 *
 * Copyright 2016 Operating Systems Laboratory EPFL
 *
 * Modified from Sparrow - University of California, Berkeley 
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package ch.epfl.eagle.daemon.nodemonitor;

import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.UnknownHostException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ConcurrentMap;

import org.apache.commons.configuration.Configuration;
import org.apache.log4j.Logger;
import org.apache.thrift.TException;
import org.apache.thrift.async.AsyncMethodCallback;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import ch.epfl.eagle.daemon.EagleConf;
import ch.epfl.eagle.daemon.nodemonitor.TaskScheduler.TaskSpec;
import ch.epfl.eagle.daemon.scheduler.TaskPlacer;
import ch.epfl.eagle.daemon.scheduler.UnconstrainedTaskPlacer;
import ch.epfl.eagle.daemon.util.Logging;
import ch.epfl.eagle.daemon.util.Network;
import ch.epfl.eagle.daemon.util.Resources;
import ch.epfl.eagle.daemon.util.ThriftClientPool;
import ch.epfl.eagle.thrift.InternalService.AsyncClient.receiveGossip_call;
import ch.epfl.eagle.thrift.InternalService.AsyncClient.sendTasksReservations_call;
import ch.epfl.eagle.thrift.SchedulerService;
import ch.epfl.eagle.thrift.SchedulerService.AsyncClient;
import ch.epfl.eagle.thrift.SchedulerService.AsyncClient.sendFrontendMessage_call;
import ch.epfl.eagle.thrift.InternalService;
import ch.epfl.eagle.thrift.TEnqueueTaskReservationsRequest;
import ch.epfl.eagle.thrift.TEnqueueTaskReservationsResponse;
import ch.epfl.eagle.thrift.TFullTaskId;
import ch.epfl.eagle.thrift.THostPort;

/**
 * A Node Monitor which is responsible for communicating with application
 * backends. This class is wrapped by multiple thrift servers, so it may be
 * concurrently accessed when handling multiple function calls simultaneously.
 */
public class NodeMonitor {
    private final static Logger LOG = Logger.getLogger(NodeMonitor.class);
    private final static Logger AUDIT_LOG = Logging.getAuditLogger(TaskScheduler.class);

    private static NodeMonitorState state;
    private HashMap<String, InetSocketAddress> appSockets = new HashMap<String, InetSocketAddress>();
    private HashMap<String, List<TFullTaskId>> appTasks = new HashMap<String, List<TFullTaskId>>();
    /** HAWK STEALING: Thrift client pool for communicating with node monitors */
    ThriftClientPool<InternalService.AsyncClient> nodeMonitorClientPool = new ThriftClientPool<InternalService.AsyncClient>(
            new ThriftClientPool.InternalServiceMakerFactory());
    private Boolean stealing;
    private int maxStealingAttempts;
    private int stealingAttempts;
    public double smallPartition = 0;
    public double bigPartition = 0;
    /** EAGLE PIGGYBACKING */
    private List<String> notExecutingLong;
    private double longStatusTimestamp;
    private boolean gossiping;

    // Map to scheduler socket address for each request id.
    private ConcurrentMap<String, InetSocketAddress> requestSchedulers = Maps.newConcurrentMap();
    private ThriftClientPool<SchedulerService.AsyncClient> schedulerClientPool = new ThriftClientPool<SchedulerService.AsyncClient>(
            new ThriftClientPool.SchedulerServiceMakerFactory());
    private TaskScheduler scheduler;
    private TaskLauncherService taskLauncherService;
    private String ipAddress;

    public void initialize(Configuration conf, int nodeMonitorInternalPort) throws UnknownHostException {
        String mode = conf.getString(EagleConf.DEPLOYMENT_MODE, "unspecified");
        stealing = conf.getBoolean(EagleConf.STEALING, EagleConf.DEFAULT_STEALING);
        maxStealingAttempts = conf.getInt(EagleConf.STEALING_ATTEMPTS, EagleConf.DEFAULT_STEALING_ATTEMPTS);
        smallPartition = conf.getInt(EagleConf.SMALL_PARTITION, EagleConf.DEFAULT_SMALL_PARTITION);
        bigPartition = conf.getInt(EagleConf.BIG_PARTITION, EagleConf.DEFAULT_BIG_PARTITION);
        gossiping = conf.getBoolean(EagleConf.GOSSIPING, EagleConf.DEFAULT_GOSSIPING);

        stealingAttempts = 0;
        LOG.info("STEALING set to : " + stealing);
        if (mode.equals("standalone")) {
            state = new StandaloneNodeMonitorState();
        } else if (mode.equals("configbased")) {
            state = new ConfigNodeMonitorState();
        } else {
            throw new RuntimeException("Unsupported deployment mode: " + mode);
        }
        try {
            state.initialize(conf);
        } catch (IOException e) {
            LOG.fatal("Error initializing node monitor state.", e);
        }

        longStatusTimestamp = -1;
        // At the beginning all nodes will be free from Long jobs
        notExecutingLong = new ArrayList<String>();
        List<InetSocketAddress> nodeList = Lists.newArrayList(state.getNodeMonitors());
        // TODO EAGLE add itself to the list
        for (InetSocketAddress isa : nodeList)
            notExecutingLong.add(isa.toString());

        int mem = Resources.getSystemMemoryMb(conf);
        LOG.info("Using memory allocation: " + mem);

        ipAddress = Network.getIPAddress(conf);

        int cores = Resources.getSystemCPUCount(conf);
        LOG.info("Using core allocation: " + cores);

        String task_scheduler_type = conf.getString(EagleConf.NM_TASK_SCHEDULER_TYPE, "fifo");
        LOG.info("Task scheduler type: " + task_scheduler_type);
        if (task_scheduler_type.equals("round_robin")) {
            scheduler = new RoundRobinTaskScheduler(cores);
        } else if (task_scheduler_type.equals("fifo")) {
            scheduler = new FifoTaskScheduler(cores, this);
        } else if (task_scheduler_type.equals("priority")) {
            scheduler = new PriorityTaskScheduler(cores);
        } else {
            throw new RuntimeException("Unsupported task scheduler type: " + mode);
        }
        scheduler.initialize(conf, nodeMonitorInternalPort);
        taskLauncherService = new TaskLauncherService();
        taskLauncherService.initialize(conf, scheduler, nodeMonitorInternalPort);
    }

    /**
     * Registers the backend with assumed 0 load, and returns true if successful.
     * Returns false if the backend was already registered.
     */
    public boolean registerBackend(String appId, InetSocketAddress nmAddr, InetSocketAddress backendAddr) {
        LOG.info("Registering app " + appId);
        LOG.info(Logging.functionCall(appId, nmAddr, backendAddr));
        if (appSockets.containsKey(appId)) {
            LOG.info("Attempt to re-register app " + appId);
            return false;
        }
        appSockets.put(appId, backendAddr);
        appTasks.put(appId, new ArrayList<TFullTaskId>());
        return state.registerBackend(appId, nmAddr);
    }

    /**
     * Account for tasks which have finished.
     */
    public void tasksFinished(List<TFullTaskId> tasks) {
        LOG.debug(System.currentTimeMillis() + " " + Logging.functionCall(tasks));
        scheduler.tasksFinished(tasks);
    }

    /* ********************************************************
     * ********************** HAWK STEALING *******************
     * *******************************************************
     */
    // II. HAWK Method for asking probes
    private List<InetSocketAddress> getCleanWorkersList() {
        Set<InetSocketAddress> backends = state.getNodeMonitors();
        List<InetSocketAddress> listBackends = new ArrayList<InetSocketAddress>(backends);
        /*try {
           //String inetAddress = InetAddress.getLocalHost().getHostAddress();
           // Take out this monitor from backends
           int i = 0;
           for (InetSocketAddress backend : backends) {
        if (backend.getAddress().getHostAddress().equals(ipAddress)) //|| backend.getAddress().getHostAddress().equals(inetAddress))
           break;            
        i++;
           }
           //LOG.debug("STEALING: Going to search "+ ipAddress + " or "+ inetAddress + " in " + listBackends+ " index "+ i);
           //assert(i<listBackends.size());
           listBackends.remove(i);
        } catch (UnknownHostException e1) {
           e1.printStackTrace();
        }*/
        // Order by id
        Collections.sort(listBackends, new Comparator<InetSocketAddress>() {
            public int compare(InetSocketAddress a1, InetSocketAddress a2) {
                return a1.toString().compareTo(a2.toString());
            }
        });
        return listBackends;
    }

    public void requestTaskReservations() {
        LOG.info(Logging.functionCall());
        // 1. call sendTaskReservations to n workers
        List<InetSocketAddress> listBackends = getCleanWorkersList();
        // Get the big partition
        LOG.debug("STEALING: Initial node list size: " + listBackends.size());
        int last_nodeID = (int) (listBackends.size() * bigPartition / 100);
        listBackends = listBackends.subList(0, last_nodeID);
        LOG.debug("STEALING: Using nodes from 0 to " + last_nodeID + ". List consists of : " + listBackends);

        LOG.debug("STEALING: New list of backends " + listBackends.toString());
        Collections.shuffle(listBackends);
        InetSocketAddress chosenBackend = listBackends.get(0);

        try {
            InternalService.AsyncClient client = nodeMonitorClientPool.borrowClient(chosenBackend);
            stealingAttempts++;
            LOG.debug("STEALING: Launching sendTasksReservations on node: " + chosenBackend + " stealing attempts"
                    + stealingAttempts);
            client.sendTasksReservations(new SendTaskReservationsCallback(chosenBackend, client));
            LOG.debug("STEALING: Finished launching sendTasksReservations on node: " + chosenBackend);
        } catch (Exception e) {
            LOG.error("Error enqueuing task on node " + chosenBackend.toString() + ":" + e);
        }

    }

    // III. HAWK Method for sending probes
    public List<TEnqueueTaskReservationsRequest> sendTaskReservations() {
        LOG.info(Logging.functionCall());
        // ask scheduler for task reservations
        List<TaskSpec> requestsList = scheduler.handleRequestTaskReservation();
        List<TEnqueueTaskReservationsRequest> reservationsList = new ArrayList<TEnqueueTaskReservationsRequest>();
        // Convert from taskspec list to TEnqueueTaskReservationsRequest
        for (TaskSpec request : requestsList) {
            THostPort schedulerAddress = Network.socketAddressToThrift(request.schedulerAddress);
            TEnqueueTaskReservationsRequest reservation = new TEnqueueTaskReservationsRequest(request.appId,
                    request.user, request.shortTask, request.requestId, schedulerAddress,
                    request.originalNodeMonitorAddress, 1);
            reservationsList.add(reservation);
        }
        return reservationsList;
    }

    /**
     * Callback for requestTaskReservations() that returns the client to the
     * client pool.
     */
    private class SendTaskReservationsCallback implements AsyncMethodCallback<sendTasksReservations_call> {
        private InetSocketAddress nodeMonitorSocket;
        private InternalService.AsyncClient client;

        public SendTaskReservationsCallback(InetSocketAddress socket, InternalService.AsyncClient client) {
            nodeMonitorSocket = socket;
            this.client = client;
        }

        public void onError(Exception exception) {
            try {
                nodeMonitorClientPool.returnClient(nodeMonitorSocket, client);
            } catch (Exception e) {
                LOG.error(e);
            }
            LOG.error("Error executing enqueueTaskReservation RPC:" + exception);
        }

        @Override
        public void onComplete(sendTasksReservations_call response) {
            try {
                List<TEnqueueTaskReservationsRequest> taskReservationRequests = response.getResult();
                // TODO check for null
                LOG.debug("STEALING: SendTaskReservationsCallback, answer got size "
                        + taskReservationRequests.size());
                // 2. enqueue task reservations
                for (TEnqueueTaskReservationsRequest taskReservationRequest : taskReservationRequests) {
                    LOG.debug("STEALING: enqueuing taskReservationRequest id "
                            + taskReservationRequest.getRequestId() + " originalNodeMonitorAddress "
                            + taskReservationRequest.originalNodeMonitorAddress);
                    enqueueTaskReservations(taskReservationRequest);
                }
                nodeMonitorClientPool.returnClient(nodeMonitorSocket, client);
                // IV. Stealing policy Retry
                if (stealingAttempts < maxStealingAttempts && (taskReservationRequests.size() == 0)) {
                    LOG.debug("STEALING: Got 0 probes. stealingAttempts" + stealingAttempts
                            + " maxStealingAttempts " + maxStealingAttempts);
                    requestTaskReservations();
                }
            } catch (Exception e) {
                LOG.error(e);
            }
        }
    }

    /* ********************************************************
     * ********************** EAGLE GOSSIPING ****************
     * *******************************************************
     */
    // II. Gossip
    private void gossipNotExecutingLong(int round) {
        List<InetSocketAddress> listBackends = getCleanWorkersList();
        // Choose randomly log(n) workers
        Collections.shuffle(listBackends);
        int gossip_fanout = (int) (Math.ceil(Math.log(listBackends.size())));
        for (int i = 0; i < gossip_fanout; i++) {
            InetSocketAddress chosenBackend = listBackends.get(i);
            try {
                InternalService.AsyncClient client = nodeMonitorClientPool.borrowClient(chosenBackend);
                LOG.debug(
                        "STEALING: Launching gossipNotExecutingLong on node: " + chosenBackend + " round " + round);
                client.receiveGossip(notExecutingLong, longStatusTimestamp, round,
                        new ReceiveGossipCallback(chosenBackend, client));
            } catch (Exception e) {
                LOG.error("Error enqueuing task on node " + chosenBackend.toString() + ":" + e);
            }
        }
    }

    /**
     * Callback for receiveGossip() that returns the client to the
     * client pool.
     */
    private class ReceiveGossipCallback implements AsyncMethodCallback<receiveGossip_call> {
        private InetSocketAddress nodeMonitorSocket;
        private InternalService.AsyncClient client;

        public ReceiveGossipCallback(InetSocketAddress socket, InternalService.AsyncClient client) {
            nodeMonitorSocket = socket;
            this.client = client;
        }

        public void onError(Exception exception) {
            try {
                nodeMonitorClientPool.returnClient(nodeMonitorSocket, client);
            } catch (Exception e) {
                LOG.error(e);
            }
            LOG.error("Error executing enqueueTaskReservation RPC:" + exception);
        }

        @Override
        public void onComplete(receiveGossip_call response) {
            try {
                LOG.debug("Gossip completed ");
                nodeMonitorClientPool.returnClient(nodeMonitorSocket, client);
            } catch (Exception e) {
                LOG.error(e);
            }
        }
    }

    // III. Receive gossip
    public void receiveGossip(List<String> notExecutingLong, double longStatusTimestamp, int round) {
        if (this.longStatusTimestamp < longStatusTimestamp) {
            this.longStatusTimestamp = longStatusTimestamp;
            this.notExecutingLong = notExecutingLong;
            //TODO CHANGE round to conf or something
            int max_rounds = 3;
            round = round++;
            if (round <= max_rounds) {
                gossipNotExecutingLong(round);
            }
        }
    }

    /* *******************************************************
     * EAGLE : piggybacking nodes NOT executing long together with probes
     * ******************************************************
     */
    // I. EAGLE Getting long status
    public boolean enqueueTasksCentralized(TEnqueueTaskReservationsRequest request, List<String> notExecutingLong,
            double longStatusTimestamp) {
        assert (notExecutingLong.size() > 0);
        LOG.info("EAGLE Received enqueue task from CENTRALIZED " + ipAddress + " longStatusTimestamp "
                + longStatusTimestamp + ", count not executing long " + notExecutingLong.size());
        this.longStatusTimestamp = longStatusTimestamp;
        this.notExecutingLong = notExecutingLong;
        if (gossiping)
            gossipNotExecutingLong(1);
        return enqueueTaskReservations(request);
    }

    public TEnqueueTaskReservationsResponse probeEnqueueTaskReservations(TEnqueueTaskReservationsRequest request) {
        boolean response = enqueueTaskReservations(request);
        LOG.debug("EAGLE probeEnqueueTaskReservations executing long: " + scheduler.getLongQueuedExecuting()
                + " longStatusTimestamp " + longStatusTimestamp + " notExecutingLong " + notExecutingLong);

        return new TEnqueueTaskReservationsResponse(response, scheduler.getLongQueuedExecuting(), notExecutingLong,
                longStatusTimestamp);
    }

    public boolean enqueueTaskReservations(TEnqueueTaskReservationsRequest request) {
        LOG.debug(Logging.functionCall(request));
        stealingAttempts = 0;
        AUDIT_LOG.info(
                Logging.auditEventString("node_monitor_enqueue_task_reservation", ipAddress, request.requestId));
        LOG.info("Received enqueue task reservation request from " + ipAddress + " for request " + request.requestId
                + " shortTask " + request.shortTask);

        InetSocketAddress schedulerAddress = new InetSocketAddress(request.getSchedulerAddress().getHost(),
                request.getSchedulerAddress().getPort());
        requestSchedulers.put(request.getRequestId(), schedulerAddress);

        InetSocketAddress socket = appSockets.get(request.getAppId());
        if (socket == null) {
            LOG.error(
                    "No socket stored for " + request.getAppId() + " (never registered?). " + "Can't launch task.");
            return false;
        }
        scheduler.submitTaskReservations(request, socket);
        return true;
    }

    public void cancelTaskReservations(String requestId) {
        int numReservationsCancelled = scheduler.cancelTaskReservations(requestId);
        AUDIT_LOG.debug(Logging.auditEventString("node_monitor_cancellation", ipAddress, requestId,
                numReservationsCancelled));
    }

    private class sendFrontendMessageCallback implements AsyncMethodCallback<sendFrontendMessage_call> {
        private InetSocketAddress frontendSocket;
        private AsyncClient client;

        public sendFrontendMessageCallback(InetSocketAddress socket, AsyncClient client) {
            frontendSocket = socket;
            this.client = client;
        }

        public void onComplete(sendFrontendMessage_call response) {
            try {
                schedulerClientPool.returnClient(frontendSocket, client);
            } catch (Exception e) {
                LOG.error(e);
            }
        }

        public void onError(Exception exception) {
            try {
                schedulerClientPool.returnClient(frontendSocket, client);
            } catch (Exception e) {
                LOG.error(e);
            }
            LOG.error(exception);
        }
    }

    public void sendFrontendMessage(String app, TFullTaskId taskId, int status, ByteBuffer message) {
        LOG.debug(Logging.functionCall(app, taskId, message));
        InetSocketAddress scheduler = requestSchedulers.get(taskId.requestId);
        if (scheduler == null) {
            LOG.error("Did not find any scheduler info for request: " + taskId);
            return;
        }

        try {
            AsyncClient client = schedulerClientPool.borrowClient(scheduler);
            client.sendFrontendMessage(app, taskId, status, message,
                    new sendFrontendMessageCallback(scheduler, client));
            LOG.debug("finished sending message");
        } catch (IOException e) {
            LOG.error(e);
        } catch (TException e) {
            LOG.error(e);
        } catch (Exception e) {
            LOG.error(e);
        }
    }

    public Boolean getStealing() {
        return stealing;
    }

    // Only for testing
    public void setStealing(Boolean stealing) {
        this.stealing = stealing;
    }

}