fr.iphc.grid.jobmanager.JobManager.java Source code

Java tutorial

Introduction

Here is the source code for fr.iphc.grid.jobmanager.JobManager.java

Source

/*
Copyright CNRS - IPHC - LSMBO 2016
Authors: 
- Alexandre Burel: alexandre.burel@unistra.fr
- Patrick Guterl: Patrick.Guterl@unistra.fr
    
This software is a computer program whose purpose is to submit jobs on a 
computing grid using the JSAGA API. It is composed of two modules, a JobMonitor 
meant to run as a service to check periodically the state of each nodes on the 
grid, and a JobManager that will send the jobs on the best nodes at the moment 
of the submission.
    
This software is governed by the CeCILL license under French law and
abiding by the rules of distribution of free software.  You can  use, 
modify and/ or redistribute the software under the terms of the CeCILL
license as circulated by CEA, CNRS and INRIA at the following URL
"http://www.cecill.info". 
    
As a counterpart to the access to the source code and  rights to copy,
modify and redistribute granted by the license, users are provided only
with a limited warranty  and the software's author,  the holder of the
economic rights,  and the successive licensors  have only  limited
liability. 
    
In this respect, the user's attention is drawn to the risks associated
with loading,  using,  modifying and/or developing or reproducing the
software by the user in light of its specific status of free software,
that may mean  that it is complicated to manipulate,  and  that  also
therefore means  that it is reserved for developers  and  experienced
professionals having in-depth computer knowledge. Users are therefore
encouraged to load and test the software's suitability as regards their
requirements in conditions enabling the security of their systems and/or 
data to be ensured and,  more generally, to use and operate it in the 
same conditions as regards security. 
    
The fact that you are presently reading this means that you have had
knowledge of the CeCILL license and that you accept its terms.
*/

package fr.iphc.grid.jobmanager;

import java.io.File;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.concurrent.Executors;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.OptionGroup;
import org.apache.commons.cli.Options;
import org.ogf.saga.error.NoSuccessException;
import org.ogf.saga.job.JobDescription;
import org.ogf.saga.session.SessionFactory;

import fr.in2p3.jsaga.command.AbstractCommand;
import fr.iphc.grid.Global;

public class JobManager extends AbstractCommand {
    private static final String OPT_HELP = "h", LONGOPT_HELP = "help";
    // required arguments
    private static final String OPT_JOB = "f", LONGOPT_JOB = "job";
    private static final String OPT_FILEJOB = "l", LONGOPT_FILEJOB = "list";
    // optional arguments
    private static final String OPT_QUEUE = "q", LONGOPT_QUEUE = "queue";
    private static final String OPT_DESCRIPTION = "d", LONGOPT_DESCRIPTION = "description";
    private static final String OPT_JOBID = "i", LONGOPT_JOBID = "jobid";
    private static final String OPT_END = "e", LONGOPT_END = "end";
    private static final String OPT_RUN = "r", LONGOPT_RUN = "run";
    private static final String OPT_WAIT = "w", LONGOPT_WAIT = "wait";
    private static final String OPT_BAD = "b", LONGOPT_BAD = "bad";
    private static final String OPT_CWD = "c", LONGOPT_CWD = "cwd";
    private static final String OPT_LOGDISPLAY = "v", LONGOPT_LOGDISPLAY = "logd";
    private static final String OPT_OPTIMIZETIMEOUTRUN = "t", LONGOPT_OPTIMIZETIMEOUTRUN = "opto";
    private static final String OPT_SETUP = "s", LONGOPT_SETUP = "setup";

    protected JobManager() {
        super("jsaga-jobmanager", null, null, new GnuParser());
    }

    static void sleep(long sleep) { // in ms
        long startTime = System.currentTimeMillis();
        long endTime = startTime + sleep;
        while (System.currentTimeMillis() < endTime) {
        }
    }

    /**
     * @param args
     */
    public static void main(String[] args) throws Exception {
        JobManager command = new JobManager();
        CommandLine line = command.parse(args);
        ArrayList<File> JdlList = new ArrayList<File>();
        Global.getOutputexecutor = Executors.newFixedThreadPool(10);
        Initialize init = new Initialize();
        String SetupFile = "setup_vigrid.xml";

        if (line.hasOption(OPT_SETUP)) {
            SetupFile = line.getOptionValue(OPT_SETUP);
        }
        if ((new File(SetupFile).isFile())) {
            init.GlobalSetup(SetupFile);
        }
        // Init Job
        if (line.hasOption(OPT_JOB)) {
            File file = new File(line.getOptionValue(OPT_JOB));
            if ((file.isFile())) {
                JdlList.add(file);
            } else {
                System.err.println("The file " + file + " doesn't exist");
                System.exit(-1);
            }
        } else {
            File file = new File(line.getOptionValue(OPT_FILEJOB));
            if ((file.isFile())) {
                JdlList = init.InitJdl(file);
            } else {
                System.err.println("The file " + file + " doesn't exist");
                System.exit(-1);
            }
        }
        if (line.hasOption(OPT_WAIT)) {
            Global.TIMEOUTWAIT = Integer.parseInt(line.getOptionValue(OPT_WAIT));
        }
        if (line.hasOption(OPT_RUN)) {
            Global.TIMEOUTRUN = Integer.parseInt(line.getOptionValue(OPT_RUN));
        }
        if (line.hasOption(OPT_END)) {
            Global.TIMEOUTEND = Integer.parseInt(line.getOptionValue(OPT_END));
        }
        if (line.hasOption(OPT_LOGDISPLAY)) {
            Global.SEUILDISPLAYLOG = Float.parseFloat(line.getOptionValue(OPT_LOGDISPLAY));
        }
        init.InitJob(JdlList);
        // Init Url Ce
        if (line.hasOption(OPT_QUEUE)) {
            Global.file = new File(line.getOptionValue(OPT_QUEUE));
        }
        if (line.hasOption(OPT_BAD)) {
            Global.BadCe = new File(line.getOptionValue(OPT_BAD));
        }
        if (line.hasOption(OPT_OPTIMIZETIMEOUTRUN)) {
            Global.OPTTIMEOUTRUN = false;
        }
        if (line.hasOption(OPT_CWD)) {
            File theDir = new File(line.getOptionValue(OPT_CWD));
            if (!theDir.exists()) {
                if (!theDir.mkdirs()) {
                    System.err.println("Working directory create failed: " + line.getOptionValue(OPT_CWD));
                    System.exit(-1);
                }
            }
            Global.Cwd = line.getOptionValue(OPT_CWD);
        } else {
            Global.Cwd = System.getProperty("user.dir");
        }
        if (!(new File(Global.Cwd)).canWrite()) {
            System.err.println(" Write permission denied : " + Global.Cwd);
            System.exit(-1);
        }
        System.out.println("Current working directory : " + Global.Cwd);
        Date start = new Date();
        init.PrintGlobalSetup();
        init.InitUrl(Global.file);
        init.InitSosCe();
        init.rmLoadFailed(Global.Cwd + "/loadFailed.txt");
        System.out.println("CE: " + Global.ListUrl.size() + " Nb JOB: " + Global.ListJob.size() + " " + new Date());
        if (Global.ListJob.size() < 6) { // pour obtenir rapport de 0.8
            Global.OPTTIMEOUTRUN = false;
        }
        // check if we can connect to the grid
        try {
            SessionFactory.createSession(true);
        } catch (NoSuccessException e) {
            System.err.println("Could not connect to the grid at all (" + e.getMessage() + ")");
            System.err.println("Aborting");
            System.exit(0);

        }
        // Launch Tread Job
        JobThread st = new JobThread(Global.ListJob, Global.ListUrl);
        st.start();
        LoggingThread logst = new LoggingThread(Global.ListJob, Global.ListUrl, Global.SEUILDISPLAYLOG);
        logst.start();
        // create Thread Hook intercept kill +CNTL+C
        Thread hook = new Thread() {
            public void run() {
                try {
                    for (Jdl job : Global.ListJob) {
                        if (job.getJobId() != null) {
                            JobThread.jobCancel(job.getJobId());
                        }
                    }
                } catch (Exception e) {
                    System.err.println("Thread Hook:\n" + e.getMessage());
                }
                // give it a change to display final job state
                try {
                    sleep(5000);
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
        };
        Runtime.getRuntime().addShutdownHook(hook);

        //      Integer timer = 180 * 60 * 1000;
        Date now = new Date();

        //      Boolean Fin = false;
        while ((!Global.END) && ((now.getTime() - start.getTime()) < Global.TIMEOUTEND * 60 * 1000)) { // TOEND
            // en
            // minutes
            now = new Date();
            // int mb = 1024*1024;
            // Getting the runtime reference from system
            // Runtime runtime = Runtime.getRuntime();
            // System.out.println("##### Heap utilization statistics [MB]
            // #####");
            // Print used memory
            // System.out.println("Used Memory:"
            // + (runtime.totalMemory() - runtime.freeMemory()) / mb);

            // Print free memory
            // System.out.println("Free Memory:"
            // + runtime.freeMemory() / mb);

            // Print total available memory
            // System.out.println("Total Memory:" + runtime.totalMemory() / mb);

            // Print Maximum available memory
            // System.out.println("Max Memory:" + runtime.maxMemory() / mb);
            // // System.out.println("NB: "+nb_end);
            // if ((float)(runtime.totalMemory() -
            // runtime.freeMemory())/(float)runtime.maxMemory() > (float)0.3){
            // System.out.println ("GC: "+(float)(runtime.totalMemory() -
            // runtime.freeMemory())/runtime.maxMemory());
            // System.gc();
            // };
            sleep(15 * 1000); // in ms

            // System.gc();
            // Fin=true;
            // for (Jdl job : Global.ListJob) {
            // if (job.getJob() != null) {
            // System.out.println("JOB: "+job.getId()+"\t"+job.getStatus());
            // if (job.getStatus().compareTo("END")==0){
            // ((JobImpl) job.getJob()).postStagingAndCleanup();
            // System.out.println("END JOB: "+job.getId());
            // job.setStatus("END");
            // }
            // if (job.getStatus().compareTo("END")!=0){
            // Fin=false;
            // }
            // System.out.println("JOB: "+job.getId()+"\t"+job.getStatus() +
            // "\t"+job.getFail()+"\t"+job.getNodeCe());
            // }
            // }
            // while ((Global.END==0) && ((new
            // Date().getTime()-start.getTime())<timer)){
        }
        // Boolean end_load=false;
        // while (!end_load){
        // end_load=true;
        // for(Jdl job:Global.ListJob){
        // if (job.getStatus().equals("LOAD")){
        // end_load=false;
        // }
        // }
        // }
        System.out.println("END JOB: " + now);
        st.halt();
        logst.halt();
        Iterator<Url> k = Global.ListUrl.iterator();
        while (k.hasNext()) {
            Url url = k.next();
            System.out.println("URL: " + url.getUrl());
        }
        Iterator<Jdl> m = Global.ListJob.iterator();
        while (m.hasNext()) {
            Jdl job = m.next();
            System.out.println(
                    "JOB: " + job.getId() + "\t" + job.getFail() + "\t" + job.getStatus() + "\t" + job.getNodeCe());
        }
        System.out.println(start + " " + new Date());
        System.exit(0);
    }

    @SuppressWarnings("static-access")
    protected Options createOptions() {
        Options opt = new Options();

        // command
        opt.addOption(OptionBuilder.withDescription("Display this help and exit").withLongOpt(LONGOPT_HELP)
                .create(OPT_HELP));

        // optional arguments
        opt.addOption(OptionBuilder.withDescription("file with cream URL [IPHC]").hasArg()
                .withLongOpt(LONGOPT_QUEUE).withArgName("queue").create(OPT_QUEUE));

        opt.addOption(OptionBuilder
                .withDescription("generate the job description in the targeted grid language "
                        + "and exit (do not submit the job)")
                .withLongOpt(LONGOPT_DESCRIPTION).create(OPT_DESCRIPTION));

        opt.addOption(OptionBuilder.withDescription("File List of Bad Ce elminate from the Ce List[IPHC]").hasArg()
                .withLongOpt(LONGOPT_BAD).create(OPT_BAD));

        opt.addOption(OptionBuilder
                .withDescription("Time -> END Program in minutes/hour (Default 10H) Value: mM hH  [IPHC]").hasArg()
                .withLongOpt(LONGOPT_END).create(OPT_END));

        opt.addOption(OptionBuilder
                .withDescription("Duration Run -Thread in minutes/hour (Default 1H) Value: mM hH  [IPHC]").hasArg()
                .withLongOpt(LONGOPT_RUN).create(OPT_RUN));

        opt.addOption(
                OptionBuilder.withDescription("Time queue wating in CE/queue in minutes (Default 15 mn) [IPHC]")
                        .hasArg().withLongOpt(LONGOPT_WAIT).create(OPT_WAIT));

        opt.addOption(OptionBuilder.withDescription("Threshold display full log in log file[IPHC]").hasArg()
                .withLongOpt(LONGOPT_LOGDISPLAY).create(OPT_LOGDISPLAY));

        opt.addOption(
                OptionBuilder.withDescription("Define working directory for monitoring file (default cwd) [IPHC]")
                        .hasArg().withLongOpt(LONGOPT_CWD).create(OPT_CWD));

        opt.addOption(OptionBuilder.withDescription("Setup xml file Global paramters (default setup) [IPHC]")
                .hasArg().withLongOpt(LONGOPT_SETUP).create(OPT_SETUP));

        opt.addOption(OptionBuilder.withDescription("Disable Optimize Timeout Run Average execution time[IPHC]")
                .withLongOpt(LONGOPT_OPTIMIZETIMEOUTRUN).create(OPT_OPTIMIZETIMEOUTRUN));

        // required arguments group jdl file | list of jdl
        OptionGroup reqGroup = new OptionGroup();
        reqGroup.addOption(OptionBuilder.withDescription("read job description from file <path> [IPHC]").hasArg()
                .withLongOpt(LONGOPT_JOB).withArgName("file").create(OPT_JOB));
        reqGroup.addOption(OptionBuilder.withDescription("read file include jdl file [IPHC]").hasArg()
                .withLongOpt(LONGOPT_FILEJOB).withArgName("list").create(OPT_FILEJOB));
        reqGroup.setRequired(true);
        opt.addOptionGroup(reqGroup);

        // optional group
        OptionGroup optGroup = new OptionGroup();
        optGroup.addOption(OptionBuilder
                .withDescription(
                        "print the job identifier as soon as it is submitted, " + "and wait for it to be finished")
                .withLongOpt(LONGOPT_JOBID).create(OPT_JOBID));

        optGroup.setRequired(false);
        opt.addOptionGroup(optGroup);

        // job description
        opt.addOption(o("positional parameters for the command").hasArgs().create(JobDescription.ARGUMENTS));
        opt.addOption(o("SPMD job type and startup mechanism").hasArg().create(JobDescription.SPMDVARIATION));
        opt.addOption(
                o("total number of cpus requested for this job").hasArg().create(JobDescription.TOTALCPUCOUNT));
        opt.addOption(o("number of process instances to start").hasArg().create(JobDescription.NUMBEROFPROCESSES));
        opt.addOption(o("number of processes to start per host").hasArg().create(JobDescription.PROCESSESPERHOST));
        opt.addOption(
                o("expected number of threads per process").hasArg().create(JobDescription.THREADSPERPROCESS));
        opt.addOption(o("set of environment variables for the job").hasArgs().withValueSeparator()
                .create(JobDescription.ENVIRONMENT));
        opt.addOption(o("working directory for the job").hasArg().create(JobDescription.WORKINGDIRECTORY));
        opt.addOption(o("run the job in interactive mode").create(JobDescription.INTERACTIVE));
        opt.addOption(o("pathname of the standard input file").hasArg().create(JobDescription.INPUT));
        opt.addOption(o("pathname of the standard output file").hasArg().create(JobDescription.OUTPUT));
        opt.addOption(o("pathname of the standard error file").hasArg().create(JobDescription.ERROR));
        opt.addOption(o("a list of file transfer directives").hasArgs().create(JobDescription.FILETRANSFER));
        opt.addOption(o("defines if output files get removed after the job finishes").hasArg()
                .create(JobDescription.CLEANUP));
        opt.addOption(o("time at which a job should be scheduled").hasArg().create(JobDescription.JOBSTARTTIME));
        opt.addOption(o("hard limit for the total job runtime").hasArg().create(JobDescription.WALLTIMELIMIT));
        opt.addOption(o("estimated total number of CPU seconds which the job will require").hasArg()
                .create(JobDescription.TOTALCPUTIME));
        opt.addOption(o("estimated amount of memory the job requires").hasArg()
                .create(JobDescription.TOTALPHYSICALMEMORY));
        opt.addOption(o("compatible processor for job submission").hasArg().create(JobDescription.CPUARCHITECTURE));
        opt.addOption(o("compatible operating system for job submission").hasArg()
                .create(JobDescription.OPERATINGSYSTEMTYPE));
        opt.addOption(
                o("list of host names which are to be considered by the resource manager as candidate targets")
                        .hasArgs().create(JobDescription.CANDIDATEHOSTS));
        opt.addOption(o("name of a queue to place the job into").hasArg().create(JobDescription.QUEUE));
        opt.addOption(o("name of an account or project name").hasArg().create(JobDescription.JOBPROJECT));
        opt.addOption(o("set of endpoints describing where to report").hasArgs().create(JobDescription.JOBCONTACT));

        // returns
        return opt;
    }

    private static OptionBuilder o(String description) {
        return OptionBuilder.withDescription(description);
    }

}