org.xlcloud.xsa.ext.hpc.service.impl.SlurmHpcJobsManager.java Source code

Java tutorial

Introduction

Here is the source code for org.xlcloud.xsa.ext.hpc.service.impl.SlurmHpcJobsManager.java

Source

/*
 * Copyright 2012 AMG.lab, a Bull Group Company
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *    http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.xlcloud.xsa.ext.hpc.service.impl;

import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import javax.inject.Inject;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.builder.ToStringBuilder;
import org.apache.commons.lang.builder.ToStringStyle;
import org.apache.log4j.Logger;
import org.xlcloud.config.ConfigParam;
import org.xlcloud.rest.exception.InternalErrorException;
import org.xlcloud.rest.exception.ObjectNotFoundException;
import org.xlcloud.rest.exception.ValidationException;
import org.xlcloud.xsa.Application;
import org.xlcloud.xsa.ExecParams;
import org.xlcloud.xsa.HpcJob;
import org.xlcloud.xsa.HpcJobSubmission;
import org.xlcloud.xsa.ext.hpc.service.ApplicationsManager;
import org.xlcloud.xsa.ext.hpc.service.HpcJobsManager;
import org.xlcloud.xsa.ext.hpc.service.parser.JobAccountingListOutputParser;
import org.xlcloud.xsa.ext.hpc.service.parser.JobAccountingOutputParser;
import org.xlcloud.xsa.ext.hpc.service.parser.JobDetailsOutputParser;
import org.xlcloud.xsa.ext.hpc.service.parser.JobSubmissionOutputParser;
import org.xlcloud.xsa.ext.hpc.service.parser.JobTerminationOutputParser;
import org.xlcloud.xsa.ext.hpc.service.parser.JobsListOutputParser;
import org.xlcloud.xsa.ext.hpc.service.process.ProcessExecutionResult;
import org.xlcloud.xsa.ext.hpc.service.process.ProcessExecutor;

import ch.lambdaj.Lambda;
import ch.lambdaj.function.convert.Converter;

/**
 * {@link HpcJobsManager} implementation issuing SLURM commands.
 * <p>
 * Every operation builds a SLURM command line ({@code sbatch},
 * {@code scontrol}, {@code sacct} or {@code scancel}), runs it through the
 * injected {@link ProcessExecutor} and hands the execution result to the
 * matching output parser.
 * 
 * @author Krzysztof Szafraski, AMG.net
 */
public class SlurmHpcJobsManager implements HpcJobsManager {

    private static final Logger LOG = Logger.getLogger(SlurmHpcJobsManager.class);

    /**
     * Column selection passed to every {@code sacct} call. Shared between
     * {@link #get(String)} and {@link #list()} so that both accounting
     * parsers are fed the same column layout.
     */
    private static final String SACCT_FORMAT = "--format=JobID,JobName,user,state,ExitCode,submit";

    @Inject
    private ApplicationsManager applicationsManager;

    @Inject
    private ProcessExecutor executor;

    @Inject
    private JobSubmissionOutputParser jobSubmissionParser;

    @Inject
    private JobDetailsOutputParser jobDetailsParser;

    @Inject
    private JobAccountingOutputParser jobAccountingParser;

    @Inject
    private JobsListOutputParser jobsListParser;

    @Inject
    private JobAccountingListOutputParser jobAccountingListOutputParser;

    @Inject
    private JobTerminationOutputParser jobTerminateOutputParser;

    /** Base working directory; each application runs in a sub-directory named after it. */
    @Inject
    @ConfigParam
    private String workDir;

    /**
     * Submits a batch job for the given application using {@code sbatch}.
     * <p>
     * The batch script is fed to {@code sbatch} as input lines: a
     * {@code #!/bin/sh} line followed by the application's commands. The
     * job's working directory is {@code workDir/applicationName}.
     * 
     * @param applicationName
     *            name of the application to look up in the
     *            {@link ApplicationsManager}; also used as the working
     *            sub-directory name
     * @param jobSubmission
     *            job name and optional execution parameters; must not be
     *            {@code null}
     * @return details of the newly submitted job, as returned by
     *         {@link #get(String)} for the id parsed from the {@code sbatch}
     *         output
     * @throws ValidationException
     *             if the submission is {@code null} or the application has
     *             no commands
     * @throws InternalErrorException
     *             declared by the interface; presumably raised by the
     *             collaborators on execution failures
     * @throws ObjectNotFoundException
     *             declared by the interface; presumably raised when the
     *             application or the submitted job cannot be found
     */
    @Override
    public HpcJob schedule(String applicationName, HpcJobSubmission jobSubmission)
            throws ValidationException, InternalErrorException, ObjectNotFoundException {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Trying to schedule job, got submission: "
                    + ToStringBuilder.reflectionToString(jobSubmission, ToStringStyle.SHORT_PREFIX_STYLE));
        }

        Application application = applicationsManager.get(applicationName);
        validateJobSubmission(application, jobSubmission);

        List<String> command = new ArrayList<>();
        command.add("sbatch");
        addOption(command, "-J", jobSubmission.getName());

        // Execution parameters are optional; unset values are simply not
        // passed, leaving the SLURM defaults in effect.
        ExecParams execParams = jobSubmission.getExecParams();
        if (execParams != null) {
            addOption(command, "-N", execParams.getNumNodes());
            addOption(command, "-c", execParams.getNumCores());
        }

        // Join workDir and the application name with exactly one separator.
        String appWorkDir = workDir + (StringUtils.endsWith(workDir, File.separator) ? "" : File.separator)
                + applicationName;
        addOption(command, "-D", appWorkDir);

        List<String> inputLines = new ArrayList<>();
        inputLines.add("#!/bin/sh");
        for (String singleCommand : application.getCommands()) {
            inputLines.add(singleCommand);
        }
        ProcessExecutionResult result = executor.run(command, inputLines);

        String jobId = jobSubmissionParser.parse(result);
        return get(jobId);
    }

    /**
     * Appends {@code option} followed by the string form of {@code value} to
     * the command line; does nothing when {@code value} is {@code null}.
     */
    private void addOption(List<String> command, String option, Object value) {
        if (value != null) {
            command.add(option);
            command.add(value.toString());
        }
    }

    /**
     * Checks that a submission was supplied and that the application has at
     * least one command to run.
     * 
     * @throws ValidationException
     *             if {@code jobSubmission} is {@code null} or the
     *             application's command list is {@code null} or empty
     */
    private void validateJobSubmission(Application application, HpcJobSubmission jobSubmission)
            throws ValidationException, InternalErrorException {
        if (jobSubmission == null) {
            String message = "HpcJobSubmission cannot be null";
            LOG.warn(message);
            throw new ValidationException(message);
        }

        // A missing command list is treated like an empty one so that the
        // caller gets a validation error instead of a NullPointerException.
        if (application.getCommands() == null || application.getCommands().isEmpty()) {
            String message = "Applications has no commands";
            LOG.info(message);
            throw new ValidationException(message);
        }
    }

    /**
     * Returns details of a single job.
     * <p>
     * The job is first looked up with {@code scontrol show job}. If the
     * details parser reports it as not found, the accounting database is
     * queried with {@code sacct} instead.
     * 
     * @param id
     *            SLURM job id, passed verbatim on the command line
     * @return the job parsed from the {@code scontrol} output, or from the
     *         {@code sacct} output when {@code scontrol} does not know the
     *         job
     */
    @Override
    public HpcJob get(String id) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Trying to get details of job with id=" + id);
        }

        ProcessExecutionResult result = executor.run(Arrays.asList("scontrol", "show", "job", id));

        try {
            return jobDetailsParser.parse(result);
        } catch (ObjectNotFoundException e) {
            // Not an error yet: the job may have already left the queue.
            LOG.debug("Job not found on queue. Searching in accounting database.");
            ProcessExecutionResult accResult = executor.run(Arrays.asList("sacct", "-j", id, "-n", "-p",
                    SACCT_FORMAT));
            return jobAccountingParser.parse(accResult);
        }
    }

    /**
     * Lists jobs known to {@code scontrol} together with cancelled,
     * completed and failed jobs reported by {@code sacct}, sorted by job id.
     * 
     * @return a new list owned by the caller; a job reported by both
     *         commands appears once, using the {@code scontrol} data
     */
    @Override
    public List<HpcJob> list() {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Trying to to list jobs");
        }

        ProcessExecutionResult result = executor.run(Arrays.asList("scontrol", "show", "jobs"));
        // Copy the parser's result: the list is extended and sorted below,
        // which would fail on an unmodifiable list and must not alter a list
        // the parser may still hold on to.
        List<HpcJob> jobs = new ArrayList<HpcJob>(jobsListParser.parse(result));

        ProcessExecutionResult accResult = executor.run(Arrays.asList("sacct", "-a", "-n", "-p",
                SACCT_FORMAT, "--state=CA,CD,F"));
        List<HpcJob> accJobs = jobAccountingListOutputParser.parse(accResult);

        /*
         * scontrol returns job details for some time after the job failed or
         * completed. This means we can get details of the same job twice - from
         * scontrol and from sacct. In this case we only include the info we get
         * from scontrol.
         */
        Set<Integer> retrievedIds = new HashSet<>(Lambda.convert(jobs, new Converter<HpcJob, Integer>() {
            @Override
            public Integer convert(HpcJob from) {
                return from.getId();
            }
        }));

        for (HpcJob accJob : accJobs) {
            if (!retrievedIds.contains(accJob.getId())) {
                jobs.add(accJob);
            }
        }

        // sort jobs by id
        Collections.sort(jobs, new Comparator<HpcJob>() {
            @Override
            public int compare(HpcJob job1, HpcJob job2) {
                return job1.getId().compareTo(job2.getId());
            }
        });

        return jobs;
    }

    /**
     * Terminates a job by running {@code scancel --signal=KILL} for it and
     * passing the execution result to the termination output parser.
     * 
     * @param id
     *            SLURM job id, passed verbatim on the command line
     */
    @Override
    public void terminate(String id) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Trying to terminate job with id=" + id);
        }

        ProcessExecutionResult result = executor.run(Arrays.asList("scancel", "--signal=KILL", id));
        jobTerminateOutputParser.parse(result);
    }
}