com.netflix.genie.web.tasks.job.JobCompletionService.java Source code

Java tutorial

Introduction

Here is the source code for com.netflix.genie.web.tasks.job.JobCompletionService.java

Source

/*
 *
 *  Copyright 2016 Netflix, Inc.
 *
 *     Licensed under the Apache License, Version 2.0 (the "License");
 *     you may not use this file except in compliance with the License.
 *     You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 *     Unless required by applicable law or agreed to in writing, software
 *     distributed under the License is distributed on an "AS IS" BASIS,
 *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *     See the License for the specific language governing permissions and
 *     limitations under the License.
 *
 */
package com.netflix.genie.web.tasks.job;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Strings;
import com.google.common.collect.Maps;
import com.netflix.genie.common.dto.Application;
import com.netflix.genie.common.dto.Job;
import com.netflix.genie.common.dto.JobExecution;
import com.netflix.genie.common.dto.JobRequest;
import com.netflix.genie.common.dto.JobStatus;
import com.netflix.genie.common.exceptions.GenieException;
import com.netflix.genie.common.exceptions.GenieServerException;
import com.netflix.genie.core.events.JobFinishedEvent;
import com.netflix.genie.core.events.JobFinishedReason;
import com.netflix.genie.core.jobs.JobConstants;
import com.netflix.genie.core.jobs.JobDoneFile;
import com.netflix.genie.core.properties.JobsProperties;
import com.netflix.genie.core.services.JobPersistenceService;
import com.netflix.genie.core.services.JobSearchService;
import com.netflix.genie.core.services.MailService;
import com.netflix.genie.core.services.impl.GenieFileTransferService;
import com.netflix.spectator.api.Counter;
import com.netflix.spectator.api.Id;
import com.netflix.spectator.api.Registry;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.exec.CommandLine;
import org.apache.commons.exec.DefaultExecutor;
import org.apache.commons.exec.Executor;
import org.apache.commons.exec.PumpStreamHandler;
import org.apache.commons.io.FileUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.core.io.Resource;
import org.springframework.retry.support.RetryTemplate;
import org.springframework.stereotype.Service;

import javax.validation.constraints.NotNull;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;

/**
 * A class that has the methods to perform various tasks when a job completes.
 *
 * @author amsharma
 * @author tgianos
 * @since 3.0.0
 */
@Slf4j
@Service
public class JobCompletionService {

    private static final String STATUS_TAG = "status";
    private static final String ERROR_TAG = "error";

    private final JobPersistenceService jobPersistenceService;
    private final JobSearchService jobSearchService;
    private final GenieFileTransferService genieFileTransferService;
    private final File baseWorkingDir;
    private final MailService mailServiceImpl;
    private final Executor executor;
    private final boolean deleteArchiveFile;
    private final boolean deleteDependencies;
    private final boolean runAsUserEnabled;

    // Metrics
    private final Registry registry;
    private final Id jobCompletionId;
    private final Counter emailSuccessRate;
    private final Counter emailFailureRate;
    private final Counter archivalFailureRate;
    private final Counter doneFileProcessingFailureRate;
    private final Counter finalStatusUpdateFailureRate;
    private final Counter processGroupCleanupFailureRate;
    private final Counter archiveFileDeletionFailure;
    private final Counter deleteDependenciesFailure;
    private final RetryTemplate retryTemplate;
    private final ObjectMapper objectMapper = new ObjectMapper();

    /**
     * Constructor.
     *
     * @param jobSearchService         An implementation of the job search service.
     * @param jobPersistenceService    An implementation of the job persistence service.
     * @param genieFileTransferService An implementation of the Genie File Transfer service.
     * @param genieWorkingDir          The working directory where all job directories are created.
     * @param mailServiceImpl          An implementation of the mail service.
     * @param registry                 The metrics registry to use
     * @param jobsProperties           The properties relating to running jobs
     * @param retryTemplate            Retry template for retrying remote calls
     * @throws GenieException if there is a problem
     */
    @Autowired
    public JobCompletionService(final JobPersistenceService jobPersistenceService,
            final JobSearchService jobSearchService, final GenieFileTransferService genieFileTransferService,
            final Resource genieWorkingDir, final MailService mailServiceImpl, final Registry registry,
            final JobsProperties jobsProperties,
            @Qualifier("genieRetryTemplate") @NotNull final RetryTemplate retryTemplate) throws GenieException {
        this.jobPersistenceService = jobPersistenceService;
        this.jobSearchService = jobSearchService;
        this.genieFileTransferService = genieFileTransferService;
        this.mailServiceImpl = mailServiceImpl;
        this.deleteArchiveFile = jobsProperties.getCleanup().isDeleteArchiveFile();
        this.deleteDependencies = jobsProperties.getCleanup().isDeleteDependencies();
        this.runAsUserEnabled = jobsProperties.getUsers().isRunAsUserEnabled();

        this.executor = new DefaultExecutor();
        this.executor.setStreamHandler(new PumpStreamHandler(null, null));

        try {
            this.baseWorkingDir = genieWorkingDir.getFile();
        } catch (IOException gse) {
            throw new GenieServerException("Could not load the base path from resource");
        }

        // Set up the metrics
        this.registry = registry;
        this.jobCompletionId = registry.createId("genie.jobs.completion.timer");
        this.emailSuccessRate = registry.counter("genie.jobs.email.success.rate");
        this.emailFailureRate = registry.counter("genie.jobs.email.failure.rate");
        this.archivalFailureRate = registry.counter("genie.jobs.archivalFailure.rate");
        this.doneFileProcessingFailureRate = registry.counter("genie.jobs.doneFileProcessingFailure.rate");
        this.finalStatusUpdateFailureRate = registry.counter("genie.jobs.finalStatusUpdateFailure.rate");
        this.processGroupCleanupFailureRate = registry.counter("genie.jobs.processGroupCleanupFailure.rate");
        this.archiveFileDeletionFailure = registry.counter("genie.jobs.archiveFileDeletionFailure.rate");
        this.deleteDependenciesFailure = registry.counter("genie.jobs.deleteDependenciesFailure.rate");
        // Retry template
        this.retryTemplate = retryTemplate;
    }

    /**
     * Event listener for when a job is completed. Updates the status of the job.
     *
     * @param event The Spring Boot application ready event to startup on
     * @throws GenieException If there is any problem
     */
    void handleJobCompletion(final JobFinishedEvent event) throws GenieException {
        final long start = System.nanoTime();
        final String jobId = event.getId();
        final Map<String, String> tags = Maps.newHashMap();

        try {
            final Job job = retryTemplate.execute(context -> getJob(jobId));

            final JobStatus status = job.getStatus();

            // Make sure the job isn't already done before doing something
            if (status.isActive()) {
                try {
                    this.retryTemplate.execute(context -> updateJob(job, event, tags));
                } catch (Exception e) {
                    log.error("Failed updating for job: {}", jobId, e);
                    tags.put(ERROR_TAG, "JOB_UPDATE_FAILURE");
                    this.finalStatusUpdateFailureRate.increment();
                }
                // Things that should be done either way
                try {
                    this.retryTemplate.execute(context -> processJobDir(job));
                } catch (Exception e) {
                    log.error("Failed archiving directory for job: {}", jobId, e);
                    tags.put(ERROR_TAG, "JOB_DIRECTORY_FAILURE");
                    this.archivalFailureRate.increment();
                }
                try {
                    this.retryTemplate.execute(context -> sendEmail(jobId));
                } catch (Exception e) {
                    log.error("Failed sending email for job: {}", jobId, e);
                    tags.put(ERROR_TAG, "SEND_EMAIL_FAILURE");
                    this.emailFailureRate.increment();
                }
            }
        } catch (Exception e) {
            log.error("Failed getting job with id: {}", jobId, e);
            tags.put(ERROR_TAG, "GET_JOB_FAILURE");
        } finally {
            final Id timerId = this.jobCompletionId.withTags(tags);
            this.registry.timer(timerId).record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
        }
    }

    private Job getJob(final String jobId) throws GenieException {
        return this.jobSearchService.getJob(jobId);
    }

    private Void updateJob(final Job job, final JobFinishedEvent event, final Map<String, String> tags)
            throws GenieException {
        final String jobId = event.getId();
        final JobStatus status = job.getStatus();
        // Now we know this job should be marked in one of the finished states
        JobStatus eventStatus = null;
        if (status == JobStatus.INIT) {
            switch (event.getReason()) {
            case KILLED:
                eventStatus = JobStatus.KILLED;
                break;
            case INVALID:
                eventStatus = JobStatus.INVALID;
                break;
            case FAILED_TO_INIT:
                eventStatus = JobStatus.FAILED;
                break;
            case PROCESS_COMPLETED:
                eventStatus = JobStatus.SUCCEEDED;
                break;
            case SYSTEM_CRASH:
                eventStatus = JobStatus.FAILED;
                break;
            default:
                eventStatus = JobStatus.INVALID;
                log.warn("Unknown event status for job: {}", jobId);
            }
        } else {
            if (event.getReason() != JobFinishedReason.SYSTEM_CRASH) {
                try {
                    final String finalStatus = this.retryTemplate
                            .execute(context -> updateFinalStatusForJob(jobId).toString());
                    tags.put(STATUS_TAG, finalStatus);
                    cleanupProcesses(jobId);
                } catch (Exception e) {
                    tags.put(ERROR_TAG, "JOB_UPDATE_FINAL_STATUS_FAILURE");
                    log.error("Failed updating the exit code and status for job: {}", jobId, e);
                    this.finalStatusUpdateFailureRate.increment();
                }
            } else {
                eventStatus = JobStatus.FAILED;
            }
        }

        if (eventStatus != null) {
            this.jobPersistenceService.updateJobStatus(jobId, eventStatus, event.getMessage());
            tags.put(STATUS_TAG, eventStatus.toString());
        }
        return null;
    }

    /**
     * An external fail-safe mechanism to clean up processes left behind by the run.sh after the
     * job is killed or failed. This method is a no-op for jobs whose status is INVALID.
     *
     * @param jobId The id of the job to cleanup processes for.
     */
    private void cleanupProcesses(final String jobId) {
        try {
            if (!this.jobSearchService.getJobStatus(jobId).equals(JobStatus.INVALID)) {
                this.jobSearchService.getJobExecution(jobId).getProcessId().ifPresent(pid -> {
                    try {
                        final CommandLine commandLine = new CommandLine(JobConstants.UNIX_PKILL_COMMAND);
                        commandLine.addArgument(JobConstants.getKillFlag());
                        commandLine.addArgument(Integer.toString(pid));
                        this.executor.execute(commandLine);

                        // The process group should not exist and the above code should always throw and exception.
                        // If it does not then the bash script is not cleaning up stuff well during kills
                        // or the script is done but child processes are still remaining. This metric tracks all that.
                        this.processGroupCleanupFailureRate.increment();
                    } catch (final Exception e) {
                        log.debug("Received expected exception. Ignoring.");
                    }
                });
            }
        } catch (final GenieException ge) {
            log.error("Unable to cleanup process for job due to exception. " + jobId, ge);
            this.processGroupCleanupFailureRate.increment();
        }
    }

    /**
     * Updates the status of the job.
     *
     * @param id The job id.
     * @return the final job status
     * @throws GenieException If there is any problem
     */
    private JobStatus updateFinalStatusForJob(final String id) throws GenieException {
        log.debug("Updating the status of the job.");

        try {
            final File jobDir = new File(this.baseWorkingDir, id);
            final JobDoneFile jobDoneFile = this.objectMapper
                    .readValue(new File(this.baseWorkingDir + "/" + id + "/genie/genie.done"), JobDoneFile.class);
            final int exitCode = jobDoneFile.getExitCode();
            // Read the size of STD OUT and STD ERR files
            final File stdOut = new File(jobDir, JobConstants.STDOUT_LOG_FILE_NAME);
            final Long stdOutSize = stdOut.exists() && stdOut.isFile() ? stdOut.length() : null;
            final File stdErr = new File(jobDir, JobConstants.STDERR_LOG_FILE_NAME);
            final Long stdErrSize = stdErr.exists() && stdErr.isFile() ? stdErr.length() : null;
            final JobStatus finalStatus;
            switch (exitCode) {
            case JobExecution.KILLED_EXIT_CODE:
                this.jobPersistenceService.setJobCompletionInformation(id, exitCode, JobStatus.KILLED,
                        "Job was killed.", stdOutSize, stdErrSize);
                finalStatus = JobStatus.KILLED;
                break;
            case JobExecution.SUCCESS_EXIT_CODE:
                this.jobPersistenceService.setJobCompletionInformation(id, exitCode, JobStatus.SUCCEEDED,
                        "Job finished successfully.", stdOutSize, stdErrSize);
                finalStatus = JobStatus.SUCCEEDED;
                break;
            // catch all for non-zero and non-zombie, killed and failed exit codes
            default:
                this.jobPersistenceService.setJobCompletionInformation(id, exitCode, JobStatus.FAILED,
                        "Job failed.", stdOutSize, stdErrSize);
                finalStatus = JobStatus.FAILED;
                break;
            }
            return finalStatus;
        } catch (final IOException ioe) {
            this.doneFileProcessingFailureRate.increment();
            // The run.sh should theoretically ALWAYS generate a done file so we should never hit this code.
            // But if we do handle it generate a metric for it which we can track
            log.error("Could not load the done file for job {}. Marking it as failed.", id, ioe);
            this.jobPersistenceService.updateJobStatus(id, JobStatus.FAILED, "Genie could not load done file.");
            return JobStatus.FAILED;
        }
    }

    /**
     * Delete the application dependencies off disk to save space.
     *
     * @param jobId  The ID of the job to delete dependencies for
     * @param jobDir The job working directory
     */
    private void deleteApplicationDependencies(final String jobId, final File jobDir) {
        log.debug("Deleting dependencies as its enabled.");
        if (jobDir.exists()) {
            try {
                final List<String> appIds = this.jobSearchService.getJobApplications(jobId).stream()
                        .map(Application::getId).filter(Optional::isPresent).map(Optional::get)
                        .collect(Collectors.toList());

                for (final String appId : appIds) {
                    final File appDependencyDir = new File(jobDir,
                            JobConstants.GENIE_PATH_VAR + JobConstants.FILE_PATH_DELIMITER
                                    + JobConstants.APPLICATION_PATH_VAR + JobConstants.FILE_PATH_DELIMITER + appId
                                    + JobConstants.FILE_PATH_DELIMITER + JobConstants.DEPENDENCY_FILE_PATH_PREFIX);

                    if (appDependencyDir.exists()) {
                        if (this.runAsUserEnabled) {
                            final CommandLine deleteCommand = new CommandLine("sudo");
                            deleteCommand.addArgument("rm");
                            deleteCommand.addArgument("-rf");
                            deleteCommand.addArgument(appDependencyDir.getCanonicalPath());
                            log.debug("Delete command is {}", deleteCommand.toString());
                            this.executor.execute(deleteCommand);
                        } else {
                            FileUtils.deleteDirectory(appDependencyDir);
                        }
                    }
                }
            } catch (Exception e) {
                log.error("Could not delete job dependencies after completion for job: {} due to error {}", jobId,
                        e);
                this.deleteDependenciesFailure.increment();
            }
        }
    }

    /**
     * Uploads the job directory to the archive location.
     *
     * @param job The job.
     * @throws GenieException if there is any problem
     */
    private boolean processJobDir(final Job job) throws GenieException, IOException {
        log.debug("Got a job finished event. Will process job directory.");
        boolean result = false;
        final Optional<String> oJobId = job.getId();

        // The deletion of dependencies and archiving only happens for job requests which are not Invalid.
        if (oJobId.isPresent()
                && !(this.jobSearchService.getJobStatus(job.getId().get()).equals(JobStatus.INVALID))) {
            final String jobId = oJobId.get();
            final File jobDir = new File(this.baseWorkingDir, jobId);

            if (jobDir.exists()) {
                if (this.deleteDependencies) {
                    this.deleteApplicationDependencies(jobId, jobDir);
                }

                final Optional<String> archiveLocation = job.getArchiveLocation();
                if (archiveLocation.isPresent() && !Strings.isNullOrEmpty(archiveLocation.get())) {
                    log.debug("Archiving job directory");
                    // Create the tar file
                    final File localArchiveFile = new File(jobDir, "genie/logs/" + jobId + ".tar.gz");

                    final CommandLine commandLine;
                    if (this.runAsUserEnabled) {
                        commandLine = new CommandLine("sudo");
                        commandLine.addArgument("tar");
                    } else {
                        commandLine = new CommandLine("tar");
                    }
                    commandLine.addArgument("-c");
                    commandLine.addArgument("-z");
                    commandLine.addArgument("-f");
                    commandLine.addArgument(localArchiveFile.getCanonicalPath());
                    commandLine.addArgument("./");

                    this.executor.setWorkingDirectory(jobDir);

                    log.debug("Archive command : {}", commandLine.toString());
                    this.executor.execute(commandLine);

                    // Upload the tar file to remote location
                    this.genieFileTransferService.putFile(localArchiveFile.getCanonicalPath(),
                            archiveLocation.get());

                    // At this point the archive file is successfully uploaded to archive location specified in the job.
                    // Now we can delete it from local disk to save space if enabled.
                    if (this.deleteArchiveFile) {
                        log.debug("Deleting archive file");
                        try {
                            if (this.runAsUserEnabled) {
                                final CommandLine deleteCommand = new CommandLine("sudo");
                                deleteCommand.addArgument("rm");
                                deleteCommand.addArgument("-f");
                                deleteCommand.addArgument(localArchiveFile.getCanonicalPath());

                                this.executor.setWorkingDirectory(jobDir);
                                log.debug("Delete command: {}", deleteCommand.toString());
                                this.executor.execute(deleteCommand);
                            } else if (!localArchiveFile.delete()) {
                                log.error("Failed to delete archive file for job: {}", jobId);
                                this.archiveFileDeletionFailure.increment();
                            }
                        } catch (final Exception e) {
                            log.error("Failed to delete archive file for job: {}", jobId, e);
                            this.archiveFileDeletionFailure.increment();
                        }
                    }
                    result = true;
                }
            }
        }
        return result;
    }

    /**
     * Sends an email when the job is completed. Returns true if an email has been sent.
     *
     * @param jobId The job id.
     * @throws GenieException If there is any problem.
     */
    private boolean sendEmail(final String jobId) throws GenieException {
        final JobRequest jobRequest = this.jobSearchService.getJobRequest(jobId);
        boolean result = false;
        final Optional<String> email = jobRequest.getEmail();

        if (email.isPresent() && !Strings.isNullOrEmpty(email.get())) {
            log.debug("Got a job finished event. Sending email: {}", email.get());
            final JobStatus status = this.jobSearchService.getJobStatus(jobId);

            final StringBuilder subject = new StringBuilder().append("Genie Job Finished. Id: [").append(jobId)
                    .append("], Name: [").append(jobRequest.getName()).append("], Status: [").append(status)
                    .append("].");

            final StringBuilder body = new StringBuilder().append("Id: [" + jobId + "]\n")
                    .append("Name: [" + jobRequest.getName() + "]\n").append("Status: [" + status + "]\n")
                    .append("User: [" + jobRequest.getUser() + "]\n")
                    .append("Description: [" + jobRequest.getDescription() + "]\n")
                    .append("Tags: " + jobRequest.getTags() + "\n");

            this.mailServiceImpl.sendEmail(email.get(), subject.toString(), body.toString());
            result = true;
            this.emailSuccessRate.increment();
        }
        return result;
    }
}