Java tutorial
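This walkthrough looks at RetzScheduler.java from Retz, Nautilus Technologies' Mesos job scheduler (Apache License 2.0). The class implements the org.apache.mesos.Scheduler callback interface: it stocks and merges resource offers, plans task placement through a pluggable Planner, and persists job state transitions through JobQueue and the Database singleton.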
/**
 *    Retz
 *    Copyright (C) 2016 Nautilus Technologies, Inc.
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 */
package io.github.retz.scheduler;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.datatype.jdk8.Jdk8Module;
import io.github.retz.cli.TimestampHelper;
import io.github.retz.db.Database;
import io.github.retz.protocol.StatusResponse;
import io.github.retz.protocol.data.Job;
import io.github.retz.protocol.exception.JobNotFoundException;
import org.apache.mesos.Protos;
import org.apache.mesos.Scheduler;
import org.apache.mesos.SchedulerDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.sql.SQLException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;

public class RetzScheduler implements Scheduler {
    public static final String FRAMEWORK_NAME = "Retz-Framework";
    public static final String HTTP_SERVER_NAME;
    private static final Logger LOG = LoggerFactory.getLogger(RetzScheduler.class);

    static {
        // TODO: stop hard-coding and get the file name in a more generic way
        // COMMENT: the trick is in build.gradle, which saves the exact jar file name as a resource bundle
        // REVIEW: http://www.eclipse.org/aether/ (not surveyed yet)
        // REVIEW: https://github.com/airlift/resolver (used in presto?)
        ResourceBundle labels = ResourceBundle.getBundle("retz-server");
        HTTP_SERVER_NAME = labels.getString("servername");
        LOG.info("Server name in HTTP(S) header: {}", HTTP_SERVER_NAME);
    }

    private final ResourceQuantity MAX_JOB_SIZE;
    private final ObjectMapper MAPPER = new ObjectMapper();
    private final Map<String, Protos.Offer> OFFER_STOCK = new ConcurrentHashMap<>();
    private final Planner PLANNER;
    private final Protos.Filters filters;
    private Launcher.Configuration conf;
    private Protos.FrameworkInfo frameworkInfo;
    private Map<String, List<Protos.SlaveID>> slaves;

    public RetzScheduler(Launcher.Configuration conf, Protos.FrameworkInfo frameworkInfo) {
        MAPPER.registerModule(new Jdk8Module());
        PLANNER = PlannerFactory.create(conf.getServerConfig().getPlannerName());
        this.conf = Objects.requireNonNull(conf);
        this.frameworkInfo = frameworkInfo;
        this.slaves = new ConcurrentHashMap<>();
        this.filters = Protos.Filters.newBuilder()
                .setRefuseSeconds(conf.getServerConfig().getRefuseSeconds())
                .build();
        MAX_JOB_SIZE = conf.getServerConfig().getMaxJobSize();
    }

    public void stopAllExecutors(SchedulerDriver driver, String appName) {
        List<Protos.SlaveID> slaves = this.slaves.get(appName);
        Protos.ExecutorID executorID = Protos.ExecutorID.newBuilder().setValue(appName).build();
        byte[] msg = {'s', 't', 'o', 'p'};
        if (slaves != null) {
            for (Protos.SlaveID slave : slaves) {
                driver.sendFrameworkMessage(executorID, slave, msg);
            }
        }
    }

    @Override
    public void disconnected(SchedulerDriver driver) {
        LOG.warn("Disconnected from cluster");
    }

    @Override
    public void error(SchedulerDriver driver, String message) {
        LOG.error(message); // 'Framework has been removed' comes here
    }

    @Override
    public void frameworkMessage(SchedulerDriver driver, Protos.ExecutorID executorId, Protos.SlaveID slaveId, byte[] data) {
        LOG.info("Framework Message ({} bytes)", data.length);
    }

    // There is a potential race between offerRescinded and the use of the offer stock:
    // while handleAll is scheduling tasks, offers are removed from OFFER_STOCK but are
    // still being used to launch tasks, so this message cannot arrive in time.
    // A task on a rescinded offer (slave) may then fail with TASK_FAILED or TASK_LOST;
    // either way, in that case it should be retried.
    @Override
    public void offerRescinded(SchedulerDriver driver, Protos.OfferID offerId) {
        LOG.info("Offer rescinded: {}", offerId.getValue());
        // OFFER_STOCK is keyed by slave ID (see handleAll), so match on the offer ID
        // instead of removing by key
        OFFER_STOCK.values().removeIf(offer -> offer.getId().getValue().equals(offerId.getValue()));
    }

    @Override
    public void registered(SchedulerDriver driver, Protos.FrameworkID frameworkId, Protos.MasterInfo masterInfo) {
        LOG.info("Connected to master {}; Framework ID: {}", masterInfo.getHostname(), frameworkId.getValue());
        frameworkInfo = frameworkInfo.toBuilder().setId(frameworkId).build();

        Optional<String> oldFrameworkId = Database.getInstance().getFrameworkId();
        if (oldFrameworkId.isPresent()) {
            if (oldFrameworkId.get().equals(frameworkId.getValue())) {
                // The framework has existed before; nothing to do but recover running jobs
                LOG.info("Framework id={} existed in the past. Recovering any running jobs...", frameworkId.getValue());
                maybeRecoverRunning(driver);
            } else {
                LOG.error("A different old framework ({}) exists (!= {}). Quitting", oldFrameworkId.get(), frameworkId.getValue());
                driver.stop();
            }
        } else if (!Database.getInstance().setFrameworkId(frameworkId.getValue())) {
            LOG.warn("Failed to remember frameworkID...");
        }
    }

    @Override
    public void reregistered(SchedulerDriver driver, Protos.MasterInfo masterInfo) {
        LOG.info("Reconnected to master {}", masterInfo.getHostname());
        // There may have been a long split brain; recovering all state from the master is required.
        maybeRecoverRunning(driver);
    }
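    // Worked example of the merge below: if the stock holds offer O1 from slave S1,
    // and Mesos now sends O2 from S1 plus O3 from S2, then S1 has two offers, so
    // both O1 and O2 are declined, while O3 is the only offer from S2 and goes into
    // `available` for scheduling. (Offer and slave names here are illustrative.)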
    @Override
    public void resourceOffers(SchedulerDriver driver, List<Protos.Offer> offers) {
        LOG.debug("Resource offer: {}", offers.size());

        // Merge fresh offers from Mesos with offers in stock, declining duplicate offers
        Stanchion.schedule(() -> {
            List<Protos.Offer> available = new LinkedList<>();
            synchronized (OFFER_STOCK) {
                // TODO: clean up this code; optimize for the max.stock = 0 case
                Map<String, List<Protos.Offer>> allOffers = new HashMap<>();
                for (Protos.Offer offer : OFFER_STOCK.values()) {
                    String key = offer.getSlaveId().getValue();
                    List<Protos.Offer> list = allOffers.getOrDefault(key, new LinkedList<>());
                    list.add(offer);
                    allOffers.put(key, list);
                }
                for (Protos.Offer offer : offers) {
                    String key = offer.getSlaveId().getValue();
                    List<Protos.Offer> list = allOffers.getOrDefault(key, new LinkedList<>());
                    list.add(offer);
                    allOffers.put(key, list);
                }
                int declined = 0;
                for (Map.Entry<String, List<Protos.Offer>> e : allOffers.entrySet()) {
                    if (e.getValue().size() == 1) {
                        available.add(e.getValue().get(0));
                    } else {
                        for (Protos.Offer dup : e.getValue()) {
                            driver.declineOffer(dup.getId(), filters);
                            declined += 1;
                        }
                    }
                }
                if (conf.fileConfig.getMaxStockSize() > 0) {
                    LOG.info("Offer stock renewal: {} offers available ({} declined from stock)",
                            available.size(), declined);
                }
                OFFER_STOCK.clear();
            }

            ResourceQuantity total = new ResourceQuantity();
            for (Protos.Offer offer : available) {
                LOG.debug("offer: {}", offer);
                Resource resource = ResourceConstructor.decode(offer.getResourcesList());
                total.add(resource);
            }

            // TODO: change findFit to consider not only CPU and memory, but also GPUs and ports
            List<Job> jobs = JobQueue.findFit(PLANNER.orderBy(), total);
            handleAll(available, jobs, driver);
            // As this whole section is serialized by Stanchion, it is safe to fetch jobs
            // from the database and to persist the state change from QUEUED to STARTING
            // in separate transactions.
        });
    }

    public void maybeInvokeNow(SchedulerDriver driver, Job job) {
        Stanchion.schedule(() -> {
            try {
                List<Job> queued = JobQueue.queued(1);
                if (!(queued.size() == 1 && queued.get(0).id() == job.id())) {
                    return;
                }
            } catch (Exception e) {
                LOG.error("maybeInvokeNow failed: {}", e.toString());
                return;
            }
            List<Protos.Offer> available = new LinkedList<>();
            synchronized (OFFER_STOCK) {
                available.addAll(OFFER_STOCK.values());
                OFFER_STOCK.clear();
            }
            // Only if the queue contains just this job, try invoking it with the offer stock
            List<Job> jobs = Arrays.asList(job);
            handleAll(available, jobs, driver);
        });
    }
    public void handleAll(List<Protos.Offer> offers, List<Job> jobs, SchedulerDriver driver) {
        // TODO: this is a flaky limitation; build it into Planner.plan as a constraint
        // Check whether the number of simultaneously running jobs has reached its limit
        int running = JobQueue.countRunning();
        if (running >= conf.fileConfig.getMaxSimultaneousJobs()) {
            LOG.warn("Number of concurrently running jobs has reached its limit: {} >= {} ({})",
                    running, conf.fileConfig.getMaxSimultaneousJobs(), ServerConfiguration.MAX_SIMULTANEOUS_JOBS);
            return;
        }

        // Make the plan
        List<Job> cancel = new LinkedList<>();
        List<AppJobPair> appJobPairs = PLANNER.filter(jobs, cancel, conf.getServerConfig().useGPU());
        // Update the database to change the state of all cancelled jobs to KILLED
        JobQueue.cancelAll(cancel);

        Plan bestPlan = PLANNER.plan(offers, appJobPairs, conf.getServerConfig().getMaxStockSize());

        int declined = 0;
        // Accept offers from Mesos
        for (OfferAcceptor acceptor : bestPlan.getOfferAcceptors()) {
            if (acceptor.getJobs().isEmpty()) {
                declined += acceptor.declineOffer(driver, filters);
            } else {
                for (Job j : acceptor.getJobs()) {
                    // Update the local database: mark the job as starting
                    JobQueue.starting(j, Optional.empty(), j.taskId());
                }
                acceptor.acceptOffers(driver, filters);
            }
        }

        for (Protos.Offer offer : bestPlan.getToStock()) {
            OFFER_STOCK.put(offer.getSlaveId().getValue(), offer);
        }
        LOG.info("{} accepted, {} declined ({} offers back in stock)",
                bestPlan.getOfferAcceptors().stream()
                        .mapToInt(offerAcceptor -> offerAcceptor.getJobs().size()).sum(),
                declined, bestPlan.getToStock().size());
    }

    @Override
    public void executorLost(SchedulerDriver driver, Protos.ExecutorID executorId, Protos.SlaveID slaveId, int status) {
        LOG.info("Executor {} of slave {} stopped: {}", executorId.getValue(), slaveId.getValue(), status);

        // TODO: do we really need to manage slaves?
        List<Protos.SlaveID> slaves = this.slaves.get(executorId.getValue());
        if (slaves != null) {
            slaves.remove(slaveId);
            this.slaves.put(executorId.getValue(), slaves);
        }
    }

    // @doc Re-schedule **all** running jobs when a slave is lost. I know it's a kludge.
    @Override
    public void slaveLost(SchedulerDriver driver, Protos.SlaveID slaveId) {
        LOG.warn("Slave lost: {}", slaveId.getValue());
        for (Map.Entry<String, List<Protos.SlaveID>> entry : slaves.entrySet()) {
            List<Protos.SlaveID> list = entry.getValue();
            // removeIf avoids the ConcurrentModificationException that removing inside
            // a for-each loop over the list could throw
            list.removeIf(s -> s.getValue().equals(slaveId.getValue()));
            slaves.put(entry.getKey(), list);
        }

        // TODO: remove **ONLY** tasks that are running on the failed slave
        Stanchion.schedule(() -> maybeRecoverRunning(driver));
    }
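    // Dispatch summary for the status updates handled below:
    //   TASK_FINISHED                          -> finished(): record exit code and sandbox URL
    //   TASK_ERROR / TASK_FAILED / TASK_KILLED -> failed()
    //   TASK_LOST                              -> retry(): put the job back for retry
    //   TASK_RUNNING                           -> started()
    //   TASK_STARTING                          -> record the sandbox URL while starting
    //   TASK_STAGING / TASK_KILLING            -> no-op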
    @Override
    public void statusUpdate(SchedulerDriver driver, Protos.TaskStatus status) {
        LOG.info("Status update of task {}: {} / {}",
                status.getTaskId().getValue(), status.getState().name(), status.getMessage());

        Stanchion.schedule(() -> {
            switch (status.getState().getNumber()) {
                case Protos.TaskState.TASK_FINISHED_VALUE: {
                    finished(status);
                    break;
                }
                case Protos.TaskState.TASK_ERROR_VALUE:
                case Protos.TaskState.TASK_FAILED_VALUE:
                case Protos.TaskState.TASK_KILLED_VALUE: {
                    failed(status);
                    break;
                }
                case Protos.TaskState.TASK_LOST_VALUE: {
                    retry(status);
                    break;
                }
                case Protos.TaskState.TASK_KILLING_VALUE:
                    break;
                case Protos.TaskState.TASK_RUNNING_VALUE:
                    started(status);
                    break;
                case Protos.TaskState.TASK_STAGING_VALUE:
                    break;
                case Protos.TaskState.TASK_STARTING_VALUE: {
                    LOG.debug("Task {} starting", status.getTaskId().getValue());
                    Optional<Job> job = JobQueue.getFromTaskId(status.getTaskId().getValue());
                    if (job.isPresent()) {
                        JobQueue.starting(job.get(),
                                MesosHTTPFetcher.sandboxBaseUri(conf.getMesosMaster(),
                                        status.getSlaveId().getValue(),
                                        frameworkInfo.getId().getValue(),
                                        status.getExecutorId().getValue()),
                                status.getTaskId().getValue());
                    }
                    break;
                }
                default:
                    break;
            }
        });
    }

    // Maybe retry
    void retry(Protos.TaskStatus status) {
        String reason = "";
        if (status.hasMessage()) {
            reason = status.getMessage();
        }
        try {
            JobQueue.retry(status.getTaskId().getValue(), reason);
        } catch (SQLException e) {
            LOG.error(e.toString(), e);
        } catch (JobNotFoundException e) {
            LOG.warn(e.toString(), e);
            // TODO: re-insert the failed job again?
        }
    }
    void finished(Protos.TaskStatus status) {
        Optional<String> maybeUrl = MesosHTTPFetcher.sandboxBaseUri(conf.getMesosMaster(),
                status.getSlaveId().getValue(), frameworkInfo.getId().getValue(), status.getExecutorId().getValue());
        int ret = status.getState().getNumber() - Protos.TaskState.TASK_FINISHED_VALUE;
        String finished = TimestampHelper.now();
        try {
            JobQueue.finished(status.getTaskId().getValue(), maybeUrl, ret, finished);
        } catch (SQLException e) {
            LOG.error(e.toString(), e);
        } catch (JobNotFoundException e) {
            LOG.warn(e.toString(), e);
            // TODO: re-insert the failed job again?
        }
    }

    void failed(Protos.TaskStatus status) {
        Optional<String> maybeUrl = MesosHTTPFetcher.sandboxBaseUri(conf.getMesosMaster(),
                status.getSlaveId().getValue(), frameworkInfo.getId().getValue(), status.getExecutorId().getValue());
        try {
            JobQueue.failed(status.getTaskId().getValue(), maybeUrl, status.getMessage());
        } catch (SQLException e) {
            LOG.error(e.toString(), e);
        } catch (JobNotFoundException e) {
            LOG.warn(e.toString(), e);
            // TODO: re-insert the failed job again?
        }
    }

    void started(Protos.TaskStatus status) {
        Optional<String> maybeUrl = MesosHTTPFetcher.sandboxBaseUri(conf.getMesosMaster(),
                status.getSlaveId().getValue(), frameworkInfo.getId().getValue(), status.getExecutorId().getValue());
        try {
            JobQueue.started(status.getTaskId().getValue(), maybeUrl);
        } catch (SQLException e) {
            LOG.error(e.toString(), e);
        } catch (JobNotFoundException e) {
            LOG.warn(e.toString(), e);
            // TODO: re-insert the failed job again?
        } catch (IOException e) {
            LOG.error(e.toString(), e);
        }
    }

    public void setOfferStats(StatusResponse statusResponse) {
        int totalCpu = 0;
        int totalMem = 0;
        int totalGpu = 0;
        for (Map.Entry<String, Protos.Offer> e : OFFER_STOCK.entrySet()) {
            Resource r = ResourceConstructor.decode(e.getValue().getResourcesList());
            totalCpu += r.cpu();
            totalMem += r.memMB();
            totalGpu += r.gpu();
        }
        // TODO: use ResourceQuantity instead of Resource
        statusResponse.setOfferStats(OFFER_STOCK.size(), totalCpu, totalMem, totalGpu);
    }

    // Get all running jobs and sync their latest state from Mesos.
    // If a job is not lost, just update its state; otherwise, set its state back to QUEUED.
    // This call must be offloaded from the scheduler callback thread while scheduling is active;
    // if it is not active, this call must block all other operations.
    private void maybeRecoverRunning(SchedulerDriver driver) {
        List<Job> jobs = Database.getInstance().getRunning();
        Database.getInstance().retryJobs(jobs.stream().map(job -> job.id()).collect(Collectors.toList()));
    }

    public boolean validateJob(Job job) {
        return MAX_JOB_SIZE.fits(job);
    }

    public ResourceQuantity maxJobSize() {
        return MAX_JOB_SIZE;
    }
}
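For completeness, here is a minimal sketch of how a Scheduler implementation like this one is typically attached to Mesos with the standard org.apache.mesos.MesosSchedulerDriver. The MesosSchedulerDriver and Protos calls are the stock Mesos Java API; the configuration loading (loadConfiguration) and the ZooKeeper master address are hypothetical placeholders, since in Retz itself this wiring lives in the Launcher class.

import org.apache.mesos.MesosSchedulerDriver;
import org.apache.mesos.Protos;

public class RetzSchedulerSketch {
    public static void main(String[] args) {
        // Framework metadata; registered() later stamps the assigned framework ID onto it.
        Protos.FrameworkInfo frameworkInfo = Protos.FrameworkInfo.newBuilder()
                .setUser("") // an empty user lets Mesos fill in the current user
                .setName(RetzScheduler.FRAMEWORK_NAME)
                .build();

        // Hypothetical: Retz builds Launcher.Configuration from its server
        // configuration file; loadConfiguration() stands in for that step.
        Launcher.Configuration conf = loadConfiguration();

        RetzScheduler scheduler = new RetzScheduler(conf, frameworkInfo);
        // The master address is a placeholder; point it at your Mesos master or ZooKeeper ensemble.
        MesosSchedulerDriver driver =
                new MesosSchedulerDriver(scheduler, frameworkInfo, "zk://localhost:2181/mesos");

        // run() blocks until the driver is stopped or aborted.
        Protos.Status status = driver.run();
        System.exit(status == Protos.Status.DRIVER_STOPPED ? 0 : 1);
    }

    private static Launcher.Configuration loadConfiguration() {
        throw new UnsupportedOperationException("placeholder for Retz configuration loading");
    }
}

Once the driver is running, Mesos drives everything else through the callbacks above: registered() or reregistered() on (re)connection, resourceOffers() as offers arrive, and statusUpdate() as tasks move through their lifecycle.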