org.apache.oodt.cas.resource.scheduler.ResourceMesosScheduler.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.oodt.cas.resource.scheduler.ResourceMesosScheduler.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.oodt.cas.resource.scheduler;

import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.lang.NotImplementedException;
import org.apache.mesos.Protos.ExecutorID;
import org.apache.mesos.Protos.ExecutorInfo;
import org.apache.mesos.Protos.FrameworkID;
import org.apache.mesos.Protos.MasterInfo;
import org.apache.mesos.Protos.Offer;
import org.apache.mesos.Protos.OfferID;
import org.apache.mesos.Protos.Resource;
import org.apache.mesos.Protos.SlaveID;
import org.apache.mesos.Protos.Status;
import org.apache.mesos.Protos.TaskID;
import org.apache.mesos.Protos.TaskInfo;
import org.apache.mesos.Protos.TaskStatus;
import org.apache.mesos.Protos.Value;
import org.apache.mesos.Scheduler;
import org.apache.mesos.SchedulerDriver;
import org.apache.oodt.cas.resource.batchmgr.Batchmgr;
import org.apache.oodt.cas.resource.batchmgr.MesosBatchManager;
import org.apache.oodt.cas.resource.jobqueue.JobQueue;
import org.apache.oodt.cas.resource.monitor.Monitor;
import org.apache.oodt.cas.resource.structs.JobSpec;
import org.apache.oodt.cas.resource.structs.ResourceNode;
import org.apache.oodt.cas.resource.structs.exceptions.JobQueueException;
import org.apache.oodt.cas.resource.structs.exceptions.MesosFrameworkException;
import org.apache.oodt.cas.resource.structs.exceptions.MonitorException;
import org.apache.oodt.cas.resource.structs.exceptions.SchedulerException;
import org.apache.oodt.cas.resource.util.MesosUtilities;

/**
 * A scheduler for part of the Mesos framework.
 *
 * <p>This class implements both the Mesos {@link Scheduler} callback interface and the
 * resource manager's own scheduler interface. Work is driven by Mesos resource offers
 * (see {@link #resourceOffers(SchedulerDriver, List)}) rather than by a polling loop, so
 * {@link #run()} does nothing and {@link #schedule(JobSpec)} is unsupported.</p>
 *
 * @author starchmd
 * @version $Revision$
 */
public class ResourceMesosScheduler implements Scheduler, org.apache.oodt.cas.resource.scheduler.Scheduler {
    SchedulerDriver driver;
    MesosBatchManager batch;
    ExecutorInfo executor;
    JobQueue queue;
    Monitor mon;

    //Logger
    private static final Logger LOG = Logger.getLogger(ResourceMesosScheduler.class.getName());

    //Amount of the "mem" resource requested per unit of job load (Mesos expresses "mem" in MB).
    private static final double MEM_PER_LOAD_UNIT = 1024.0;
    //Below these remainders an offer is considered too depleted to be worth scanning more jobs for.
    private static final double MIN_USEFUL_CPUS = 0.5;
    private static final double MIN_USEFUL_MEM = 512.0;

    /**
     * Construct the scheduler
     * @param batch - batch manager (must be MesosBatchManager)
     * @param executor - Mesos ExecutorInfo
     * @param queue Job Queue used
     * @param mon - monitor used.
     */
    public ResourceMesosScheduler(MesosBatchManager batch, ExecutorInfo executor, JobQueue queue, Monitor mon) {
        this.batch = batch;
        this.executor = executor;
        this.queue = queue;
        this.mon = mon;
        LOG.log(Level.INFO, "Creating the resource-mesos scheduler.");
    }

    /* (non-Javadoc)
     * Currently a no-op.
     * @see org.apache.mesos.Scheduler#disconnected(org.apache.mesos.SchedulerDriver)
     */
    @Override
    public void disconnected(SchedulerDriver schedDriver) {
        //TODO: Pause scheduler until master comes back online.

    }

    /* (non-Javadoc)
     * Logs the error reported by Mesos; no recovery is attempted.
     * @see org.apache.mesos.Scheduler#error(org.apache.mesos.SchedulerDriver, java.lang.String)
     */
    @Override
    public void error(SchedulerDriver schedDriver, String error) {
        LOG.log(Level.SEVERE, "Mesos issued an error: " + error);
        //TODO: kill something here.
    }

    /* (non-Javadoc)
     * Logs the lost executor; no other action is taken here.
     * @see org.apache.mesos.Scheduler#executorLost(org.apache.mesos.SchedulerDriver, org.apache.mesos.Protos.ExecutorID, org.apache.mesos.Protos.SlaveID, int)
     */
    @Override
    public void executorLost(SchedulerDriver schedDriver, ExecutorID executor, SlaveID slave, int status) {
        //Tasks will have a "task lost" message automatically q.e.d no action necessary.
        //TODO: do we need to restart?
        LOG.log(Level.SEVERE, "Mesos executor " + executor + " on slave " + slave + " died with status " + status);
    }

    /* (non-Javadoc)
     * Logs the message payload, decoded as ASCII.
     * @see org.apache.mesos.Scheduler#frameworkMessage(org.apache.mesos.SchedulerDriver, org.apache.mesos.Protos.ExecutorID, org.apache.mesos.Protos.SlaveID, byte[])
     */
    @Override
    public void frameworkMessage(SchedulerDriver schedDriver, ExecutorID executor, SlaveID slave, byte[] bytes) {
        try {
            LOG.log(Level.INFO, "Mesos framework executor" + executor + " on slave " + slave + " issued message: "
                    + new String(bytes, "ascii"));
        } catch (UnsupportedEncodingException e) {
            LOG.log(Level.WARNING, "Mesos framework message missed due to bad encoding: ascii. " + e.getMessage());
        }
    }

    /* (non-Javadoc)
     * Currently a no-op.
     * @see org.apache.mesos.Scheduler#offerRescinded(org.apache.mesos.SchedulerDriver, org.apache.mesos.Protos.OfferID)
     */
    @Override
    public void offerRescinded(SchedulerDriver schedDriver, OfferID offer) {
        //TODO: take away resources from batch manager...or stand in.
        //Unneeded?
    }

    /* (non-Javadoc)
     * Logs the registration.
     * @see org.apache.mesos.Scheduler#registered(org.apache.mesos.SchedulerDriver, org.apache.mesos.Protos.FrameworkID, org.apache.mesos.Protos.MasterInfo)
     */
    @Override
    public void registered(SchedulerDriver schedDriver, FrameworkID framework, MasterInfo masterInfo) {
        LOG.log(Level.INFO,
                "Mesos framework registered: " + framework.getValue() + " with master: " + masterInfo.getId());
    }

    /* (non-Javadoc)
     * Logs the re-registration.
     * @see org.apache.mesos.Scheduler#reregistered(org.apache.mesos.SchedulerDriver, org.apache.mesos.Protos.MasterInfo)
     */
    @Override
    public void reregistered(SchedulerDriver schedDriver, MasterInfo masterInfo) {
        LOG.log(Level.INFO, "Mesos framework re-registered with: " + masterInfo.getId());
        //TODO: call start, we are registered.

    }

    /* (non-Javadoc)
     * Registers every offering slave with the monitor, assigns queued jobs to the offers,
     * launches the assigned tasks (one launch call per offer) and declines every offer that
     * received no task.
     *
     * Throws MesosFrameworkException if the driver reports any status other than
     * DRIVER_RUNNING after a launch.
     * @see org.apache.mesos.Scheduler#resourceOffers(org.apache.mesos.SchedulerDriver, java.util.List)
     */
    @Override
    public void resourceOffers(SchedulerDriver driver, List<Offer> offers) {
        LOG.log(Level.INFO, "Offered mesos resources: " + offers.size() + " offers.");
        for (Offer offer : offers) {
            //Node registration must not depend on the configured log level.
            registerOfferNode(offer);
            if (LOG.isLoggable(Level.FINER)) {
                LOG.log(Level.FINER,
                        "Offer (" + offer.getId().getValue() + "): " + offer.getHostname() + "(Slave: "
                                + offer.getSlaveId().getValue() + ") "
                                + MesosUtilities.getResourceMessage(offer.getResourcesList()));
            }
        }

        List<JobSet> assignments = this.getJobAssignmentsJobs(offers);
        List<OfferID> used = new LinkedList<OfferID>();

        //An offer may only be used in a single launch call, so every task assigned to the same
        //offer is collected and launched together. getJobAssignmentsJobs emits the assignments
        //grouped by offer, hence comparing against the previous offer id is sufficient.
        OfferID currentOffer = null;
        List<TaskInfo> currentTasks = new LinkedList<TaskInfo>();
        for (JobSet assignment : assignments) {
            OfferID offerId = assignment.offer.getId();
            if (currentOffer != null && !currentOffer.equals(offerId)) {
                launchOnOffer(driver, currentOffer, currentTasks);
                currentTasks = new LinkedList<TaskInfo>();
            }
            if (currentTasks.isEmpty()) {
                currentOffer = offerId;
                used.add(offerId);
            }
            //Register locally before launching on mesos
            batch.registerExecutedJob(assignment.job.getJob().getId(), assignment.task.getTaskId());
            currentTasks.add(assignment.task);
        }
        if (!currentTasks.isEmpty()) {
            launchOnOffer(driver, currentOffer, currentTasks);
        }

        for (Offer offer : offers) {
            if (!used.contains(offer.getId())) {
                LOG.log(Level.INFO, "Rejecting Offer: " + offer.getId().getValue());
                driver.declineOffer(offer.getId());
            }
        }
    }

    /**
     * Adds the slave behind the given offer to the monitor. Failures are logged and otherwise
     * ignored so that a bad node never prevents the remaining offers from being processed.
     * @param offer - offer whose slave should be made known to the monitor
     */
    private void registerOfferNode(Offer offer) {
        try {
            this.mon.addNode(new ResourceNode(offer.getSlaveId().getValue(),
                    new URL("http://" + offer.getHostname()), -1));
        } catch (MalformedURLException e) {
            LOG.log(Level.WARNING, "Cannot add node to monitor (bad url).  Giving up: " + e.getMessage());
        } catch (MonitorException e) {
            LOG.log(Level.WARNING, "Cannot add node to monitor (unkn).  Giving up: " + e.getMessage());
        }
    }

    /**
     * Launches all given tasks against a single offer.
     * @param driver - driver used to launch the tasks
     * @param offerId - the offer the tasks were assigned to
     * @param tasks - tasks to launch on that offer
     * @throws MesosFrameworkException if the driver is not running after the launch
     */
    private void launchOnOffer(SchedulerDriver driver, OfferID offerId, List<TaskInfo> tasks) {
        //Launch tasks requires lists
        List<OfferID> ids = new LinkedList<OfferID>();
        ids.add(offerId);
        Status status = driver.launchTasks(ids, tasks);
        if (status != Status.DRIVER_RUNNING) {
            throw new MesosFrameworkException("Driver stopped: " + status.toString());
        }
    }

    /**
     * Builds a TaskInfo from the given jobspec. The task requests "cpus" equal to the job's
     * load value and "mem" equal to the load value times {@link #MEM_PER_LOAD_UNIT}.
     * @param job - JobSpec to TaskInfo-ify
     * @param offer - offer add extra data (SlaveId)
     * @return TaskInfo fully formed
     */
    private TaskInfo getTaskInfo(JobSpec job, Offer offer) {
        TaskID taskId = TaskID.newBuilder().setValue(job.getJob().getId()).build();
        return TaskInfo.newBuilder().setName("task " + taskId.getValue()).setTaskId(taskId)
                .setSlaveId(offer.getSlaveId())
                .addResources(Resource.newBuilder().setName("cpus").setType(Value.Type.SCALAR)
                        .setScalar(Value.Scalar.newBuilder().setValue(job.getJob().getLoadValue() * 1.0)))
                .addResources(Resource.newBuilder().setName("mem").setType(Value.Type.SCALAR)
                        .setScalar(Value.Scalar.newBuilder()
                                .setValue(job.getJob().getLoadValue() * MEM_PER_LOAD_UNIT)))
                .setExecutor(ExecutorInfo.newBuilder(executor)).setData(MesosUtilities.jobSpecToByteString(job))
                .build();
    }

    /**
     * Checks all offers against jobs in order, assigning jobs to offers until each offer is full,
     * or all jobs are gone. Jobs that do not fit the current offer are requeued. The returned
     * assignments are grouped by offer, in the order the offers were given.
     * @param offers - offers to assign jobs to.
     * @return List of <JobSpec,TaskInfo,Offer> tuples (assigned to each other).
     * @throws RuntimeException wrapping a JobQueueException raised by the job queue
     */
    private List<JobSet> getJobAssignmentsJobs(List<Offer> offers) {
        List<JobSet> list = new LinkedList<JobSet>();
        for (Offer offer : offers) {
            double cpus = 0.0, mem = 0.0;
            //Get the resources offered from this offer
            for (Resource resc : offer.getResourcesList()) {
                if (resc.getName().equals("cpus")) {
                    cpus += resc.getScalar().getValue();
                }
                if (resc.getName().equals("mem")) {
                    mem += resc.getScalar().getValue();
                }
            }
            //Search for enough jobs to fill the offer. The queue size is sampled once: the loop
            //body removes jobs, so re-reading the size as the bound would end the scan before
            //every pending job had been looked at. The emptiness check protects against the
            //queue being drained in the meantime.
            int pending = queue.getSize();
            for (int i = 0; i < pending && queue.getSize() > 0; i++) {
                try {
                    JobSpec job = queue.getNextJob();
                    double load = job.getJob().getLoadValue();
                    //Check if enough resources
                    if (cpus < load || mem < load * MEM_PER_LOAD_UNIT) {
                        queue.requeueJob(job);
                        continue;
                    }
                    cpus -= load;
                    mem -= MEM_PER_LOAD_UNIT * load;
                    list.add(new JobSet(job, getTaskInfo(job, offer), offer));
                    //Not enough left, optimise and stop looking for jobs
                    if (cpus < MIN_USEFUL_CPUS || mem <= MIN_USEFUL_MEM) {
                        break;
                    }
                } catch (JobQueueException e) {
                    throw new RuntimeException(e);
                }
            }
            //Optimization: break when no jobs
            if (queue.getSize() == 0) {
                break;
            }
        }
        return list;
    }

    /* (non-Javadoc)
     * Logs the lost slave; jobs are not yet reissued.
     * @see org.apache.mesos.Scheduler#slaveLost(org.apache.mesos.SchedulerDriver, org.apache.mesos.Protos.SlaveID)
     */
    @Override
    public void slaveLost(SchedulerDriver schedDriver, SlaveID slave) {
        LOG.log(Level.WARNING, "Mesos slave " + slave + " lost, reissuing jobs.");
        //TODO: reregister jobs
    }

    /* (non-Javadoc)
     * Logs the message carried by the status update.
     * @see org.apache.mesos.Scheduler#statusUpdate(org.apache.mesos.SchedulerDriver, org.apache.mesos.Protos.TaskStatus)
     */
    @Override
    public void statusUpdate(SchedulerDriver schedDriver, TaskStatus taskStatus) {
        //TODO: deliver messages, some rerun, some finish.
        LOG.log(Level.INFO, "Status update: " + taskStatus.getMessage());
    }

    /**
     * Does nothing besides logging: scheduling happens in the Mesos callbacks.
     */
    @Override
    public void run() {
        LOG.log(Level.INFO, "Attempting to run framework. Nothing to do.");
        LOG.log(Level.FINEST, "Paradigm shift enabled.");
        LOG.log(Level.FINEST, "Spin and poll surplanted by event based execution.");
        LOG.log(Level.FINEST, "Mesos-OODT Fusion complete.");
        //Don't run anything
    }

    /**
     * Unsupported: always throws.
     * @param spec - ignored
     * @throws NotImplementedException always
     */
    @Override
    public boolean schedule(JobSpec spec) throws SchedulerException {
        throw new NotImplementedException("Schedule is not called when using mesos.");
    }

    /**
     * Not supported by this scheduler.
     * @param spec - ignored
     * @return always null
     */
    @Override
    public ResourceNode nodeAvailable(JobSpec spec) throws SchedulerException {
        return null;
    }

    /**
     * @return the monitor given at construction
     */
    @Override
    public Monitor getMonitor() {
        return mon;
    }

    /**
     * @return the Mesos batch manager given at construction
     */
    @Override
    public Batchmgr getBatchmgr() {
        return batch;
    }

    /**
     * @return the job queue given at construction
     */
    @Override
    public JobQueue getJobQueue() {
        return queue;
    }

    /**
     * This scheduler holds no queue manager.
     * @return always null
     */
    @Override
    public QueueManager getQueueManager() {
        return null;
    }

    //Job set used internally to simplify data transmission: an immutable (job, task, offer) tuple.
    //Static because it never touches the enclosing scheduler instance.
    private static final class JobSet {
        public final JobSpec job;
        public final TaskInfo task;
        public final Offer offer;

        //Build a job set
        public JobSet(JobSpec job, TaskInfo task, Offer offer) {
            this.job = job;
            this.task = task;
            this.offer = offer;
        }
    }
}