Java tutorial
/* * * Copyright 2015 Netflix, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package com.netflix.genie.core.services.impl; import com.netflix.genie.common.dto.Application; import com.netflix.genie.common.dto.Cluster; import com.netflix.genie.common.dto.Command; import com.netflix.genie.common.dto.JobExecution; import com.netflix.genie.common.dto.JobRequest; import com.netflix.genie.common.dto.JobStatusMessages; import com.netflix.genie.common.exceptions.GenieException; import com.netflix.genie.common.exceptions.GeniePreconditionException; import com.netflix.genie.common.exceptions.GenieServerException; import com.netflix.genie.core.events.GenieEventBus; import com.netflix.genie.core.events.JobFinishedEvent; import com.netflix.genie.core.events.JobFinishedReason; import com.netflix.genie.core.events.JobStartedEvent; import com.netflix.genie.core.jobs.JobConstants; import com.netflix.genie.core.jobs.JobExecutionEnvironment; import com.netflix.genie.core.jobs.workflow.WorkflowTask; import com.netflix.genie.core.services.JobPersistenceService; import com.netflix.genie.core.services.JobSubmitterService; import com.netflix.spectator.api.Registry; import com.netflix.spectator.api.Timer; import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; import lombok.NonNull; import lombok.extern.slf4j.Slf4j; import org.springframework.core.io.Resource; import javax.validation.Valid; import javax.validation.constraints.Min; import javax.validation.constraints.NotNull; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.io.Writer; import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; /** * Implementation of the Job Submitter service that runs the job locally on the same host. * * @author amsharma * @author tgianos * @since 3.0.0 */ @Slf4j public class LocalJobRunner implements JobSubmitterService { private final JobPersistenceService jobPersistenceService; private final List<WorkflowTask> jobWorkflowTasks; private final Resource baseWorkingDirPath; private final GenieEventBus genieEventBus; private final Timer overallSubmitTimer; private final Timer createJobDirTimer; private final Timer createRunScriptTimer; private final Timer executeJobTimer; private final Timer saveJobExecutionTimer; private final Timer publishJobStartedEventTimer; private final Timer createInitFailureDetailsFileTimer; /** * Constructor create the object. * * @param jobPersistenceService Implementation of the job persistence service * @param genieEventBus The event bus implementation to use * @param workflowTasks List of all the workflow tasks to be executed * @param genieWorkingDir Working directory for genie where it creates jobs directories * @param registry The metrics registry to use */ public LocalJobRunner(@NotNull final JobPersistenceService jobPersistenceService, @NonNull final GenieEventBus genieEventBus, @NotNull final List<WorkflowTask> workflowTasks, @NotNull final Resource genieWorkingDir, @NotNull final Registry registry) { this.jobPersistenceService = jobPersistenceService; this.genieEventBus = genieEventBus; this.jobWorkflowTasks = workflowTasks; this.baseWorkingDirPath = genieWorkingDir; // Metrics this.overallSubmitTimer = registry.timer("genie.jobs.submit.localRunner.overall.timer"); this.createJobDirTimer = registry.timer("genie.jobs.submit.localRunner.createJobDir.timer"); this.createRunScriptTimer = registry.timer("genie.jobs.submit.localRunner.createRunScript.timer"); this.executeJobTimer = registry.timer("genie.jobs.submit.localRunner.executeJob.timer"); this.saveJobExecutionTimer = registry.timer("genie.jobs.submit.localRunner.saveJobExecution.timer"); this.publishJobStartedEventTimer = registry .timer("genie.jobs.submit.localRunner.publishJobStartedEvent.timer"); this.createInitFailureDetailsFileTimer = registry .timer("genie.jobs.submit.localRunner.createInitFailureDetailsFile.timer"); } /** * {@inheritDoc} */ @SuppressFBWarnings(value = "REC_CATCH_EXCEPTION", justification = "We catch exception to make sure we always mark job failed.") @Override public void submitJob( @NotNull(message = "No job provided. Unable to submit job for execution.") @Valid final JobRequest jobRequest, @NotNull(message = "No cluster provided. Unable to submit job for execution") @Valid final Cluster cluster, @NotNull(message = "No command provided. Unable to submit job for execution") @Valid final Command command, @NotNull(message = "No applications provided. Unable to execute") final List<Application> applications, @Min(value = 1, message = "Memory can't be less than 1 MB") final int memory) throws GenieException { final long start = System.nanoTime(); try { log.info("Beginning local job submission for {}", jobRequest); final String id = jobRequest.getId().orElseThrow(() -> new GenieServerException("No job id found.")); try { final File jobWorkingDir = this.createJobWorkingDirectory(id); final File runScript = this.createRunScript(jobWorkingDir); // The map object stores the context for all the workflow tasks final Map<String, Object> context = this.createJobContext(jobRequest, cluster, command, applications, memory, jobWorkingDir); // Execute the job final JobExecution jobExecution = this.executeJob(context, runScript); // Job Execution will be null in local mode. if (jobExecution != null) { // Persist the jobExecution information. This also updates jobStatus to Running final long createJobExecutionStart = System.nanoTime(); try { log.info("Saving job execution for job {}", jobRequest.getId()); this.jobPersistenceService.setJobRunningInformation(id, jobExecution.getProcessId() .orElseThrow(() -> new GenieServerException( "No process id returned. Unable to persist")), jobExecution.getCheckDelay().orElse(Command.DEFAULT_CHECK_DELAY), jobExecution.getTimeout().orElseThrow(() -> new GenieServerException( "No timeout date returned. Unable to persist"))); } finally { this.saveJobExecutionTimer.record(System.nanoTime() - createJobExecutionStart, TimeUnit.NANOSECONDS); } // Publish a job start Event final long publishEventStart = System.nanoTime(); try { log.info("Publishing job started event for job {}", id); this.genieEventBus.publishSynchronousEvent(new JobStartedEvent(jobExecution, this)); } finally { this.publishJobStartedEventTimer.record(System.nanoTime() - publishEventStart, TimeUnit.NANOSECONDS); } } } catch (final GeniePreconditionException gpe) { log.error(gpe.getMessage(), gpe); this.createInitFailureDetailsFile(id, gpe); this.genieEventBus.publishAsynchronousEvent(new JobFinishedEvent(id, JobFinishedReason.INVALID, JobStatusMessages.SUBMIT_PRECONDITION_FAILURE, this)); throw gpe; } catch (final Exception e) { log.error(e.getMessage(), e); this.createInitFailureDetailsFile(id, e); this.genieEventBus.publishAsynchronousEvent(new JobFinishedEvent(id, JobFinishedReason.FAILED_TO_INIT, JobStatusMessages.SUBMIT_INIT_FAILURE, this)); throw e; } } finally { this.overallSubmitTimer.record(System.nanoTime() - start, TimeUnit.NANOSECONDS); } } private void createInitFailureDetailsFile(final String id, final Exception e) { final long start = System.nanoTime(); try { final File jobDir = new File(this.baseWorkingDirPath.getFile(), id); if (jobDir.exists()) { final File detailsFile = new File(jobDir, JobConstants.GENIE_INIT_FAILURE_MESSAGE_FILE_NAME); final boolean detailsFileExists = !detailsFile.createNewFile(); if (detailsFileExists) { log.warn("Init failure details file exists"); } try (final PrintWriter p = new PrintWriter( new OutputStreamWriter(new FileOutputStream(detailsFile), StandardCharsets.UTF_8))) { p.format(" *** Initialization failure for job: %s ***%n" + "%n" + "Exception: %s - %s%n" + "Trace:%n", id, e.getClass().getCanonicalName(), e.getMessage()); e.printStackTrace(p); } log.info("Created init failure details file {}", detailsFile); } else { log.error("Could not create init failure details file, job directory does not exist"); } } catch (Throwable t) { log.error("Failed to create init failure details file", t); } finally { this.createInitFailureDetailsFileTimer.record(System.nanoTime() - start, TimeUnit.NANOSECONDS); } } private File createJobWorkingDirectory(final String id) throws GenieException { final long start = System.nanoTime(); try { final File jobDir = new File(this.baseWorkingDirPath.getFile(), id); if (!jobDir.mkdirs()) { throw new GenieServerException( "Could not create job working directory directory: " + jobDir.getCanonicalPath()); } log.info("Created job dir {}", jobDir); return jobDir; } catch (final IOException ioe) { throw new GenieServerException("Could not resolve job working directory due to exception", ioe); } finally { this.createJobDirTimer.record(System.nanoTime() - start, TimeUnit.NANOSECONDS); } } private File createRunScript(final File jobWorkingDir) throws GenieException { final long start = System.nanoTime(); try { final File runScript = new File(jobWorkingDir, JobConstants.GENIE_JOB_LAUNCHER_SCRIPT); if (!runScript.exists()) { try { if (!runScript.createNewFile()) { throw new GenieServerException("Unable to create run script file due to unknown reason."); } } catch (final IOException ioe) { throw new GenieServerException("Unable to create run script file due to IOException.", ioe); } } if (!runScript.setExecutable(true)) { throw new GenieServerException("Unable to make run script executable"); } log.info("Created run script {}", runScript); return runScript; } finally { this.createRunScriptTimer.record(System.nanoTime() - start, TimeUnit.NANOSECONDS); } } private Map<String, Object> createJobContext(final JobRequest jobRequest, final Cluster cluster, final Command command, final List<Application> applications, final int memory, final File jobWorkingDir) throws GenieException { // construct the job execution environment object for this job request final JobExecutionEnvironment jee = new JobExecutionEnvironment.Builder(jobRequest, cluster, command, memory, jobWorkingDir).withApplications(applications).build(); // The map object stores the context for all the workflow tasks final Map<String, Object> context = new HashMap<>(); context.put(JobConstants.JOB_EXECUTION_ENV_KEY, jee); return context; } private JobExecution executeJob(final Map<String, Object> context, final File runScript) throws GenieException { final long start = System.nanoTime(); try (final Writer writer = new OutputStreamWriter(new FileOutputStream(runScript), StandardCharsets.UTF_8)) { final String jobId = ((JobExecutionEnvironment) context.get(JobConstants.JOB_EXECUTION_ENV_KEY)) .getJobRequest().getId() .orElseThrow(() -> new GenieServerException("No job id. Unable to execute")); log.info("Executing job workflow for job {}", jobId); context.put(JobConstants.WRITER_KEY, writer); for (WorkflowTask workflowTask : this.jobWorkflowTasks) { workflowTask.executeTask(context); if (Thread.currentThread().isInterrupted()) { log.info("Interrupted job workflow for job {}", jobId); break; } } log.info("Finished Executing job workflow for job {}", jobId); return (JobExecution) context.get(JobConstants.JOB_EXECUTION_DTO_KEY); } catch (final IOException ioe) { throw new GenieServerException("Failed to execute job due to: " + ioe.getMessage(), ioe); } finally { this.executeJobTimer.record(System.nanoTime() - start, TimeUnit.NANOSECONDS); } } }