io.github.retz.scheduler.Launcher.java Source code

Java tutorial

Introduction

Here is the source code for io.github.retz.scheduler.Launcher.java

Source

/**
 *    Retz
 *    Copyright (C) 2016-2017 Nautilus Technologies, Inc.
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 */
package io.github.retz.scheduler;

import com.j256.simplejmx.server.JmxServer;
import io.github.retz.db.Database;
import io.github.retz.mesosc.MesosHTTPFetcher;
import io.github.retz.protocol.data.Job;
import io.github.retz.web.StatusCache;
import io.github.retz.web.WebConsole;
import org.apache.commons.cli.*;
import org.apache.mesos.Protos;
import org.apache.mesos.SchedulerDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.management.*;
import java.io.IOException;
import java.lang.management.ManagementFactory;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.sql.SQLException;
import java.util.*;
import java.util.stream.Collectors;

/**
 * Entry point of the Retz scheduler process.
 *
 * <p>Parses command line options, loads the server configuration, initializes the
 * database, the JMX server, the Mesos scheduler driver and the web console, and then
 * blocks until the scheduler driver stops.
 */
public final class Launcher {
    static final Option OPT_CONFIG;
    static final Option OPT_MODE;
    private static final Logger LOG = LoggerFactory.getLogger(Launcher.class);
    private static final Options OPTIONS;

    static {
        OPT_CONFIG = new Option("C", "config", true, "Configuration file path");
        OPT_CONFIG.setArgName("/path/to/retz.properties");

        OPT_MODE = new Option("M", "mode", true, "Scheduler mode ( local|mesos )");
        OPT_MODE.setArgName("mesos");

        OPTIONS = new Options();
        OPTIONS.addOption(OPT_CONFIG);
        OPTIONS.addOption(OPT_MODE);
    }

    /**
     * Runs the launcher and terminates the JVM with the value returned by {@link #run(String...)}.
     *
     * @param argv command line arguments
     */
    public static void main(String... argv) {
        System.exit(run(argv));
    }

    /**
     * Starts all server components and blocks until the scheduler driver has stopped.
     *
     * <p>This method never terminates the JVM by itself; every failure is reported through
     * the return value so that it can be called from other code as well as from {@link #main}.
     *
     * @param argv command line arguments (see {@link #OPT_CONFIG} and {@link #OPT_MODE})
     * @return {@code -1} if any component failed to initialize or the driver could not be started,
     *         {@code 0} if the driver finished with {@code DRIVER_STOPPED}, {@code 255} otherwise
     */
    public static int run(String... argv) {

        Configuration conf;
        try {
            conf = parseConfiguration(argv);
            if (conf.fileConfig.isTLS()) {
                LOG.warn("Make sure a valid certificate is being used or RetzExecutor may not work.");
            }
            Database.getInstance().init(conf.getServerConfig());
            if (conf.getServerConfig().getGc()) {
                GarbageJobCollector.start(conf.getServerConfig().getGcLeeway(),
                        conf.getServerConfig().getGcInterval());
            } else {
                LOG.info("Automatic garbage collection is turned off; use retz-admin gc to collect old jobs");
            }
        } catch (ParseException | URISyntaxException | SQLException | IOException e) {
            // Pass the exception itself as well so that the stack trace is not lost.
            LOG.error(e.toString(), e);
            return -1;
        }

        Optional<JmxServer> maybeJmxServer = AdminConsole.startJmxServer(conf.getServerConfig());
        if (!maybeJmxServer.isPresent()) {
            LOG.error("Failed to start JMX Server");
            stopGcAndDatabase();
            return -1;
        }
        JmxServer jmxServer = maybeJmxServer.get();

        Protos.FrameworkInfo fw = buildFrameworkInfo(conf);

        // Retz must finish the whole recovery process before launching the scheduler,
        // because a running scheduler changes the state of jobs once it has
        // successfully connected to the Mesos master.
        // By hitting the HTTP endpoints and comparing the result with the job states in
        // the database, Retz can decide whether to re-run a job or just finish it.
        // BTW after connecting, Mesos looks like it re-sends unacked messages.
        maybeRequeueRunningJobs(conf.getMesosMaster(), fw.getId().getValue(), Database.getInstance().getRunning());

        RetzScheduler scheduler;
        try {
            scheduler = new RetzScheduler(conf, fw);
        } catch (Throwable t) {
            LOG.error("Cannot initialize scheduler", t);
            stopGcAndDatabase();
            jmxServer.stop();
            return -1;
        }
        SchedulerDriver driver = SchedulerDriverFactory.create(scheduler, conf, fw);

        Protos.Status status = driver.start();

        if (status != Protos.Status.DRIVER_RUNNING) {
            LOG.error("Cannot start Mesos scheduler: {}", status.name());
            // Report the failure like every other failure path instead of killing the JVM here;
            // main() turns the return value into the process exit code.
            stopGcAndDatabase();
            jmxServer.stop();
            return -1;
        }

        LOG.info("Mesos scheduler started: {}", status.name());

        // Start web server
        WebConsole.start(conf.fileConfig);
        WebConsole.set(scheduler, driver);
        LOG.info("Web console has started with port {}", conf.getPort());

        java.lang.Runtime.getRuntime().addShutdownHook(new ShutdownThread(driver));

        // Stop them all, usually don't come here
        // Wait for Mesos framework stop
        status = driver.join();
        LOG.info("{} has been stopped: {}", RetzScheduler.FRAMEWORK_NAME, status.name());

        WebConsole.stop(); // Stop web server
        stopGcAndDatabase();
        jmxServer.stop();

        return (status == Protos.Status.DRIVER_STOPPED ? 0 : 255);
    }

    /** Stops the garbage job collector and then the database, as the regular shutdown path does. */
    private static void stopGcAndDatabase() {
        GarbageJobCollector.stop();
        Database.getInstance().stop();
    }

    /**
     * Reconciles jobs recorded as running in the database with the tasks reported over HTTP.
     *
     * <p>Tasks are fetched page by page (128 per request) until an empty page is returned.
     * For each task whose id matches the task id of a job in {@code running}, the job is
     * updated through {@code JobQueue.updateJobStatus} with the reported state; all updated
     * jobs are then written back to the database in one call.
     *
     * @param master      Mesos master location passed to the task fetcher
     * @param frameworkId framework ID whose tasks are fetched
     * @param running     jobs currently recorded as running in the database
     * @throws RuntimeException      if fetching tasks fails with a {@link MalformedURLException}
     * @throws IllegalStateException if two jobs in {@code running} share a task id
     *                               (raised by {@link Collectors#toMap})
     */
    private static void maybeRequeueRunningJobs(String master, String frameworkId, List<Job> running) {
        LOG.info("{} jobs found in DB 'STARTING' or 'STARTED' state. Requeuing...", running.size());
        final int limit = 128;
        int offset = 0;
        // Jobs still left in this map after the loop were not reported at all.
        Map<String, Job> runningMap = running.stream().collect(Collectors.toMap(job -> job.taskId(), job -> job));
        List<Job> recoveredJobs = new ArrayList<>();
        while (true) {
            try {
                List<Map<String, Object>> tasks = MesosHTTPFetcher.fetchTasks(master, frameworkId, offset, limit);
                if (tasks.isEmpty()) {
                    break;
                }

                for (Map<String, Object> task : tasks) {
                    String state = (String) task.get("state");
                    String taskId = (String) task.get("id");
                    // The map never holds null values, so null means "no such job".
                    Job job = runningMap.remove(taskId);
                    if (job != null) {
                        recoveredJobs.add(JobQueue.updateJobStatus(job, state));
                    } else {
                        LOG.warn("Unknown job! taskId={} state={}", taskId, state);
                    }
                }
                offset = offset + tasks.size();
            } catch (MalformedURLException e) {
                LOG.error(e.toString());
                // Keep the original exception as the cause so the stack trace survives.
                throw new RuntimeException(e.toString(), e);
            }
        }
        Database.getInstance().updateJobs(recoveredJobs);
        LOG.info("{} jobs rescheduled, {} jobs didn't need change.", recoveredJobs.size(), runningMap.size());
    }

    /**
     * Builds the framework description used to register with Mesos.
     *
     * <p>If the database already holds a framework ID, it is reused; if GPU is enabled in the
     * configuration, the {@code GPU_RESOURCES} capability is added.
     *
     * @param conf launcher configuration
     * @return the framework info built from {@code conf} and the stored framework ID, if any
     */
    private static Protos.FrameworkInfo buildFrameworkInfo(Configuration conf) {
        String userName = conf.fileConfig.getUserName();

        Protos.FrameworkInfo.Builder fwBuilder = Protos.FrameworkInfo.newBuilder()
                .setUser(userName)
                .setName(RetzScheduler.FRAMEWORK_NAME)
                .setWebuiUrl(conf.fileConfig.getUri().toASCIIString())
                .setFailoverTimeout(3600 * 24 * 7) // 604800; one week if the unit is seconds
                .setCheckpoint(true)
                .setPrincipal(conf.fileConfig.getPrincipal())
                .setRole(conf.fileConfig.getRole());

        Optional<String> fid = Database.getInstance().getFrameworkId();
        if (fid.isPresent()) {
            LOG.info("FrameworkID {} found", fid.get());
            fwBuilder.setId(Protos.FrameworkID.newBuilder().setValue(fid.get()).build());
        }

        if (conf.fileConfig.useGPU()) {
            LOG.info("GPU enabled - registering with GPU_RESOURCES capability.");
            fwBuilder.addCapabilities(Protos.FrameworkInfo.Capability.newBuilder()
                    .setType(Protos.FrameworkInfo.Capability.Type.GPU_RESOURCES).build());
        }

        LOG.info("Connecting to Mesos master {} as {}", conf.getMesosMaster(), userName);
        return fwBuilder.build();
    }

    /**
     * Parses command line arguments and loads the server configuration file.
     *
     * @param argv command line arguments
     * @return the configuration with its launch mode set
     * @throws ParseException     if the arguments cannot be parsed or the mode is neither
     *                            {@code local} nor {@code mesos}
     * @throws IOException        declared for loading the configuration file
     * @throws URISyntaxException declared for loading the configuration file
     */
    static Configuration parseConfiguration(String[] argv) throws ParseException, IOException, URISyntaxException {
        CommandLineParser parser = new DefaultParser();
        CommandLine cmd = parser.parse(OPTIONS, argv);

        // This default path must match the prefix in build.gradle
        String configFile = cmd.getOptionValue(OPT_CONFIG.getOpt(), "/opt/retz-server/etc/retz.properties");

        Configuration conf = new Configuration(new ServerConfiguration(configFile));
        LOG.info("Binding as {}", conf.fileConfig.getUri()); // TODO hostname, protocol

        String mode = cmd.getOptionValue(OPT_MODE.getOpt(), "mesos");
        if ("local".equals(mode)) {
            conf.launchMode = Configuration.Mode.LOCAL;
            LOG.warn("Using local mode. This is for *TESTS*, don't use this in production");
        } else if ("mesos".equals(mode)) {
            conf.launchMode = Configuration.Mode.MESOS;
        } else {
            // Fail fast instead of returning a configuration whose launch mode is null.
            throw new ParseException("Unknown scheduler mode '" + mode + "'; expected 'local' or 'mesos'");
        }

        return conf;
    }

    /**
     * Launcher configuration: the file-based server configuration plus the launch mode
     * selected on the command line.
     */
    public static final class Configuration {
        ServerConfiguration fileConfig;
        Mode launchMode;

        /**
         * @param fileConfig server configuration loaded from file
         * @throws NullPointerException if {@code fileConfig} or its Mesos master location is null
         */
        public Configuration(ServerConfiguration fileConfig) {
            Objects.requireNonNull(fileConfig, "File configuration cannot be null");
            Objects.requireNonNull(fileConfig.getMesosMaster(), "Mesos master location cannot be empty");

            this.fileConfig = fileConfig;
        }

        /** @return the port of the URI in the server configuration */
        public int getPort() {
            return fileConfig.getUri().getPort();
        }

        /** @return the Mesos master location from the server configuration */
        public String getMesosMaster() {
            return fileConfig.getMesosMaster();
        }

        /** @return the underlying file-based server configuration */
        public ServerConfiguration getServerConfig() {
            return fileConfig;
        }

        /** Scheduler launch mode selected with the {@code mode} command line option. */
        enum Mode {
            LOCAL, MESOS
        }

    }
}