/*
*
* Copyright 2016 Netflix, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.netflix.genie.web.tasks.job;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Strings;
import com.google.common.collect.Maps;
import com.netflix.genie.common.dto.Application;
import com.netflix.genie.common.dto.Job;
import com.netflix.genie.common.dto.JobExecution;
import com.netflix.genie.common.dto.JobRequest;
import com.netflix.genie.common.dto.JobStatus;
import com.netflix.genie.common.exceptions.GenieException;
import com.netflix.genie.common.exceptions.GenieServerException;
import com.netflix.genie.core.events.JobFinishedEvent;
import com.netflix.genie.core.events.JobFinishedReason;
import com.netflix.genie.core.jobs.JobConstants;
import com.netflix.genie.core.jobs.JobDoneFile;
import com.netflix.genie.core.properties.JobsProperties;
import com.netflix.genie.core.services.JobPersistenceService;
import com.netflix.genie.core.services.JobSearchService;
import com.netflix.genie.core.services.MailService;
import com.netflix.genie.core.services.impl.GenieFileTransferService;
import com.netflix.spectator.api.Counter;
import com.netflix.spectator.api.Id;
import com.netflix.spectator.api.Registry;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.exec.CommandLine;
import org.apache.commons.exec.DefaultExecutor;
import org.apache.commons.exec.Executor;
import org.apache.commons.exec.PumpStreamHandler;
import org.apache.commons.io.FileUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.core.io.Resource;
import org.springframework.retry.support.RetryTemplate;
import org.springframework.stereotype.Service;
import javax.validation.constraints.NotNull;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
/**
 * Performs the work required once a job has finished: persisting the final job status, cleaning up any
 * processes left behind, optionally deleting application dependencies, archiving the job directory and
 * sending a notification email. Timing and failure metrics are published for each step.
 *
 * @author amsharma
 * @author tgianos
 * @since 3.0.0
 */
@Slf4j
@Service
public class JobCompletionService {

    private static final String STATUS_TAG = "status";
    private static final String ERROR_TAG = "error";

    private final JobPersistenceService jobPersistenceService;
    private final JobSearchService jobSearchService;
    private final GenieFileTransferService genieFileTransferService;
    private final File baseWorkingDir;
    private final MailService mailServiceImpl;
    // NOTE(review): this executor is shared by every invocation and its working directory is mutated
    // while archiving; confirm job completion events are never processed concurrently.
    private final Executor executor;
    private final boolean deleteArchiveFile;
    private final boolean deleteDependencies;
    private final boolean runAsUserEnabled;

    // Metrics
    private final Registry registry;
    private final Id jobCompletionId;
    private final Counter emailSuccessRate;
    private final Counter emailFailureRate;
    private final Counter archivalFailureRate;
    private final Counter doneFileProcessingFailureRate;
    private final Counter finalStatusUpdateFailureRate;
    private final Counter processGroupCleanupFailureRate;
    private final Counter archiveFileDeletionFailure;
    private final Counter deleteDependenciesFailure;

    private final RetryTemplate retryTemplate;
    private final ObjectMapper objectMapper = new ObjectMapper();

    /**
     * Constructor.
     *
     * @param jobPersistenceService    An implementation of the job persistence service.
     * @param jobSearchService         An implementation of the job search service.
     * @param genieFileTransferService An implementation of the Genie File Transfer service.
     * @param genieWorkingDir          The working directory where all job directories are created.
     * @param mailServiceImpl          An implementation of the mail service.
     * @param registry                 The metrics registry to use
     * @param jobsProperties           The properties relating to running jobs
     * @param retryTemplate            Retry template for retrying remote calls
     * @throws GenieException if the working directory resource cannot be resolved to a file
     */
    @Autowired
    public JobCompletionService(
        final JobPersistenceService jobPersistenceService,
        final JobSearchService jobSearchService,
        final GenieFileTransferService genieFileTransferService,
        final Resource genieWorkingDir,
        final MailService mailServiceImpl,
        final Registry registry,
        final JobsProperties jobsProperties,
        @Qualifier("genieRetryTemplate") @NotNull final RetryTemplate retryTemplate
    ) throws GenieException {
        this.jobPersistenceService = jobPersistenceService;
        this.jobSearchService = jobSearchService;
        this.genieFileTransferService = genieFileTransferService;
        this.mailServiceImpl = mailServiceImpl;
        this.deleteArchiveFile = jobsProperties.getCleanup().isDeleteArchiveFile();
        this.deleteDependencies = jobsProperties.getCleanup().isDeleteDependencies();
        this.runAsUserEnabled = jobsProperties.getUsers().isRunAsUserEnabled();

        this.executor = new DefaultExecutor();
        this.executor.setStreamHandler(new PumpStreamHandler(null, null));

        try {
            this.baseWorkingDir = genieWorkingDir.getFile();
        } catch (final IOException ioe) {
            // Keep the original failure attached so the root cause is visible to whoever handles this
            throw new GenieServerException("Could not load the base path from resource", ioe);
        }

        // Set up the metrics
        this.registry = registry;
        this.jobCompletionId = registry.createId("genie.jobs.completion.timer");
        this.emailSuccessRate = registry.counter("genie.jobs.email.success.rate");
        this.emailFailureRate = registry.counter("genie.jobs.email.failure.rate");
        this.archivalFailureRate = registry.counter("genie.jobs.archivalFailure.rate");
        this.doneFileProcessingFailureRate = registry.counter("genie.jobs.doneFileProcessingFailure.rate");
        this.finalStatusUpdateFailureRate = registry.counter("genie.jobs.finalStatusUpdateFailure.rate");
        this.processGroupCleanupFailureRate = registry.counter("genie.jobs.processGroupCleanupFailure.rate");
        this.archiveFileDeletionFailure = registry.counter("genie.jobs.archiveFileDeletionFailure.rate");
        this.deleteDependenciesFailure = registry.counter("genie.jobs.deleteDependenciesFailure.rate");

        // Retry template
        this.retryTemplate = retryTemplate;
    }

    /**
     * Handles the completion of a job. If the job is still in an active state its final status is persisted,
     * its directory is processed (dependency deletion and archival) and a notification email is sent. A failure
     * in one step is logged and counted but does not prevent the remaining steps from running. The total time
     * taken is always recorded in a timer tagged with the resulting status and/or the last error encountered.
     *
     * @param event The job finished event carrying the job id and the reason the job finished
     * @throws GenieException If there is any problem
     */
    void handleJobCompletion(final JobFinishedEvent event) throws GenieException {
        final long start = System.nanoTime();
        final String jobId = event.getId();
        final Map<String, String> tags = Maps.newHashMap();

        try {
            final Job job = this.retryTemplate.execute(context -> getJob(jobId));

            // Make sure the job isn't already done before doing something
            if (job.getStatus().isActive()) {
                try {
                    this.retryTemplate.execute(context -> updateJob(job, event, tags));
                } catch (Exception e) {
                    log.error("Failed updating for job: {}", jobId, e);
                    tags.put(ERROR_TAG, "JOB_UPDATE_FAILURE");
                    this.finalStatusUpdateFailureRate.increment();
                }

                // Things that should be done either way
                try {
                    this.retryTemplate.execute(context -> processJobDir(job));
                } catch (Exception e) {
                    log.error("Failed archiving directory for job: {}", jobId, e);
                    tags.put(ERROR_TAG, "JOB_DIRECTORY_FAILURE");
                    this.archivalFailureRate.increment();
                }

                try {
                    this.retryTemplate.execute(context -> sendEmail(jobId));
                } catch (Exception e) {
                    log.error("Failed sending email for job: {}", jobId, e);
                    tags.put(ERROR_TAG, "SEND_EMAIL_FAILURE");
                    this.emailFailureRate.increment();
                }
            }
        } catch (Exception e) {
            log.error("Failed getting job with id: {}", jobId, e);
            tags.put(ERROR_TAG, "GET_JOB_FAILURE");
        } finally {
            final Id timerId = this.jobCompletionId.withTags(tags);
            this.registry.timer(timerId).record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
        }
    }

    /**
     * Look up a job via the job search service.
     *
     * @param jobId The id of the job to fetch
     * @return The job
     * @throws GenieException If the search service fails
     */
    private Job getJob(final String jobId) throws GenieException {
        return this.jobSearchService.getJob(jobId);
    }

    /**
     * Persist the final status of a job. A job still in INIT, or one that ended due to a system crash, takes
     * its status from the event reason. Otherwise the status is derived from the job's done file and any
     * left-over processes are cleaned up.
     *
     * @param job   The job as currently persisted
     * @param event The event describing why the job finished
     * @param tags  Metric tags, updated in place with the resulting status or error
     * @return Always null; the Void return type lets this be used as a retry callback
     * @throws GenieException If the status derived from the event could not be persisted
     */
    private Void updateJob(final Job job, final JobFinishedEvent event, final Map<String, String> tags)
        throws GenieException {
        final String jobId = event.getId();

        // Now we know this job should be marked in one of the finished states
        JobStatus eventStatus = null;
        if (job.getStatus() == JobStatus.INIT) {
            eventStatus = this.toFinalStatus(event.getReason(), jobId);
        } else if (event.getReason() == JobFinishedReason.SYSTEM_CRASH) {
            eventStatus = JobStatus.FAILED;
        } else {
            try {
                final String finalStatus =
                    this.retryTemplate.execute(context -> updateFinalStatusForJob(jobId).toString());
                tags.put(STATUS_TAG, finalStatus);
                cleanupProcesses(jobId);
            } catch (Exception e) {
                tags.put(ERROR_TAG, "JOB_UPDATE_FINAL_STATUS_FAILURE");
                log.error("Failed updating the exit code and status for job: {}", jobId, e);
                this.finalStatusUpdateFailureRate.increment();
            }
        }

        if (eventStatus != null) {
            this.jobPersistenceService.updateJobStatus(jobId, eventStatus, event.getMessage());
            tags.put(STATUS_TAG, eventStatus.toString());
        }
        return null;
    }

    /**
     * Map the reason a job finished to the status to persist for a job that never left INIT.
     *
     * @param reason The reason reported by the job finished event
     * @param jobId  The id of the job, used only for logging
     * @return The job status corresponding to the reason; INVALID for unrecognized reasons
     */
    private JobStatus toFinalStatus(final JobFinishedReason reason, final String jobId) {
        switch (reason) {
            case KILLED:
                return JobStatus.KILLED;
            case INVALID:
                return JobStatus.INVALID;
            case FAILED_TO_INIT:
            case SYSTEM_CRASH:
                return JobStatus.FAILED;
            case PROCESS_COMPLETED:
                return JobStatus.SUCCEEDED;
            default:
                log.warn("Unknown event status for job: {}", jobId);
                return JobStatus.INVALID;
        }
    }

    /**
     * An external fail-safe mechanism to clean up processes left behind by the run.sh after the
     * job is killed or failed. This method is a no-op for jobs whose status is INVALID.
     *
     * @param jobId The id of the job to cleanup processes for.
     */
    private void cleanupProcesses(final String jobId) {
        try {
            if (!this.jobSearchService.getJobStatus(jobId).equals(JobStatus.INVALID)) {
                this.jobSearchService.getJobExecution(jobId).getProcessId().ifPresent(pid -> {
                    try {
                        final CommandLine commandLine = new CommandLine(JobConstants.UNIX_PKILL_COMMAND);
                        commandLine.addArgument(JobConstants.getKillFlag());
                        commandLine.addArgument(Integer.toString(pid));
                        this.executor.execute(commandLine);

                        // The process group should not exist and the above code should always throw an exception.
                        // If it does not then the bash script is not cleaning up stuff well during kills
                        // or the script is done but child processes are still remaining. This metric tracks all that.
                        this.processGroupCleanupFailureRate.increment();
                    } catch (final Exception ignored) {
                        // A failing kill command means there was nothing left to kill, which is the normal case
                        log.debug("Received expected exception. Ignoring.");
                    }
                });
            }
        } catch (final GenieException ge) {
            log.error("Unable to cleanup process for job {} due to exception.", jobId, ge);
            this.processGroupCleanupFailureRate.increment();
        }
    }

    /**
     * Reads the done file written into the job directory, derives the final job status from the recorded exit
     * code and persists it together with the sizes of the standard out and standard error files. If the done
     * file cannot be read the job is marked as FAILED.
     *
     * @param id The job id.
     * @return the final job status
     * @throws GenieException If there is any problem
     */
    private JobStatus updateFinalStatusForJob(final String id) throws GenieException {
        log.debug("Updating the status of the job.");

        try {
            final File jobDir = new File(this.baseWorkingDir, id);
            final JobDoneFile jobDoneFile = this.objectMapper.readValue(
                new File(jobDir, "genie/genie.done"),
                JobDoneFile.class
            );
            final int exitCode = jobDoneFile.getExitCode();

            // Read the size of STD OUT and STD ERR files
            final Long stdOutSize = sizeIfRegularFile(new File(jobDir, JobConstants.STDOUT_LOG_FILE_NAME));
            final Long stdErrSize = sizeIfRegularFile(new File(jobDir, JobConstants.STDERR_LOG_FILE_NAME));

            final JobStatus finalStatus;
            final String statusMessage;
            switch (exitCode) {
                case JobExecution.KILLED_EXIT_CODE:
                    finalStatus = JobStatus.KILLED;
                    statusMessage = "Job was killed.";
                    break;
                case JobExecution.SUCCESS_EXIT_CODE:
                    finalStatus = JobStatus.SUCCEEDED;
                    statusMessage = "Job finished successfully.";
                    break;
                // catch all for non-zero and non-zombie, killed and failed exit codes
                default:
                    finalStatus = JobStatus.FAILED;
                    statusMessage = "Job failed.";
                    break;
            }

            this.jobPersistenceService.setJobCompletionInformation(
                id,
                exitCode,
                finalStatus,
                statusMessage,
                stdOutSize,
                stdErrSize
            );
            return finalStatus;
        } catch (final IOException ioe) {
            this.doneFileProcessingFailureRate.increment();
            // The run.sh should theoretically ALWAYS generate a done file so we should never hit this code.
            // But if we do, handle it and generate a metric for it which we can track
            log.error("Could not load the done file for job {}. Marking it as failed.", id, ioe);
            this.jobPersistenceService.updateJobStatus(
                id,
                JobStatus.FAILED,
                "Genie could not load done file."
            );
            return JobStatus.FAILED;
        }
    }

    /**
     * Get the length of a file if it exists and is a regular file.
     *
     * @param file The file to measure
     * @return The file length in bytes or null if the file doesn't exist or isn't a regular file
     */
    private static Long sizeIfRegularFile(final File file) {
        if (file.exists() && file.isFile()) {
            return file.length();
        }
        return null;
    }

    /**
     * Delete the application dependencies off disk to save space. Any failure is logged and counted
     * rather than propagated.
     *
     * @param jobId  The ID of the job to delete dependencies for
     * @param jobDir The job working directory
     */
    private void deleteApplicationDependencies(final String jobId, final File jobDir) {
        log.debug("Deleting dependencies as its enabled.");
        if (!jobDir.exists()) {
            return;
        }

        try {
            final List<String> appIds = this.jobSearchService
                .getJobApplications(jobId)
                .stream()
                .map(Application::getId)
                .filter(Optional::isPresent)
                .map(Optional::get)
                .collect(Collectors.toList());

            for (final String appId : appIds) {
                final File appDependencyDir = new File(
                    jobDir,
                    JobConstants.GENIE_PATH_VAR
                        + JobConstants.FILE_PATH_DELIMITER
                        + JobConstants.APPLICATION_PATH_VAR
                        + JobConstants.FILE_PATH_DELIMITER
                        + appId
                        + JobConstants.FILE_PATH_DELIMITER
                        + JobConstants.DEPENDENCY_FILE_PATH_PREFIX
                );

                if (!appDependencyDir.exists()) {
                    continue;
                }

                if (this.runAsUserEnabled) {
                    // Files were created by the job user so elevated privileges are needed to remove them
                    final CommandLine deleteCommand = new CommandLine("sudo");
                    deleteCommand.addArgument("rm");
                    deleteCommand.addArgument("-rf");
                    deleteCommand.addArgument(appDependencyDir.getCanonicalPath());
                    log.debug("Delete command is {}", deleteCommand.toString());
                    this.executor.execute(deleteCommand);
                } else {
                    FileUtils.deleteDirectory(appDependencyDir);
                }
            }
        } catch (Exception e) {
            log.error("Could not delete job dependencies after completion for job: {}", jobId, e);
            this.deleteDependenciesFailure.increment();
        }
    }

    /**
     * Processes the job directory once the job is done: optionally deletes application dependencies and, if
     * the job has an archive location, tars up the directory and uploads it to that location. Nothing is done
     * for jobs without an id, jobs whose status is INVALID or jobs whose directory doesn't exist.
     *
     * @param job The job.
     * @return true if the job directory was archived and uploaded, false otherwise
     * @throws GenieException if there is any problem
     * @throws IOException    if the archive command fails or a path cannot be resolved
     */
    private boolean processJobDir(final Job job) throws GenieException, IOException {
        log.debug("Got a job finished event. Will process job directory.");

        final Optional<String> oJobId = job.getId();
        if (!oJobId.isPresent()) {
            return false;
        }
        final String jobId = oJobId.get();

        // The deletion of dependencies and archiving only happens for job requests which are not Invalid.
        if (this.jobSearchService.getJobStatus(jobId).equals(JobStatus.INVALID)) {
            return false;
        }

        final File jobDir = new File(this.baseWorkingDir, jobId);
        if (!jobDir.exists()) {
            return false;
        }

        if (this.deleteDependencies) {
            this.deleteApplicationDependencies(jobId, jobDir);
        }

        final Optional<String> archiveLocation = job.getArchiveLocation();
        if (!archiveLocation.isPresent() || Strings.isNullOrEmpty(archiveLocation.get())) {
            return false;
        }

        this.archiveJobDir(jobId, jobDir, archiveLocation.get());
        return true;
    }

    /**
     * Create a gzipped tar of the job directory, upload it to the archive location and, if enabled,
     * delete the local copy of the archive.
     *
     * @param jobId           The id of the job being archived
     * @param jobDir          The job working directory
     * @param archiveLocation The remote location to upload the archive to
     * @throws GenieException if the upload fails
     * @throws IOException    if the tar command fails or a path cannot be resolved
     */
    private void archiveJobDir(
        final String jobId,
        final File jobDir,
        final String archiveLocation
    ) throws GenieException, IOException {
        log.debug("Archiving job directory");

        // Create the tar file
        final File localArchiveFile = new File(jobDir, "genie/logs/" + jobId + ".tar.gz");
        final String localArchivePath = localArchiveFile.getCanonicalPath();

        final CommandLine commandLine;
        if (this.runAsUserEnabled) {
            commandLine = new CommandLine("sudo");
            commandLine.addArgument("tar");
        } else {
            commandLine = new CommandLine("tar");
        }
        commandLine.addArgument("-c");
        commandLine.addArgument("-z");
        commandLine.addArgument("-f");
        commandLine.addArgument(localArchivePath);
        commandLine.addArgument("./");

        // The tar command archives "./" so it has to run from within the job directory
        this.executor.setWorkingDirectory(jobDir);
        log.debug("Archive command : {}", commandLine.toString());
        this.executor.execute(commandLine);

        // Upload the tar file to remote location
        this.genieFileTransferService.putFile(localArchivePath, archiveLocation);

        // At this point the archive file is successfully uploaded to archive location specified in the job.
        // Now we can delete it from local disk to save space if enabled.
        if (this.deleteArchiveFile) {
            this.deleteLocalArchive(jobId, jobDir, localArchiveFile);
        }
    }

    /**
     * Best-effort removal of the local archive file. Failures are logged and counted, never propagated.
     *
     * @param jobId            The id of the job the archive belongs to
     * @param jobDir           The job working directory
     * @param localArchiveFile The archive file to delete
     */
    private void deleteLocalArchive(final String jobId, final File jobDir, final File localArchiveFile) {
        log.debug("Deleting archive file");
        try {
            if (this.runAsUserEnabled) {
                final CommandLine deleteCommand = new CommandLine("sudo");
                deleteCommand.addArgument("rm");
                deleteCommand.addArgument("-f");
                deleteCommand.addArgument(localArchiveFile.getCanonicalPath());

                this.executor.setWorkingDirectory(jobDir);
                log.debug("Delete command: {}", deleteCommand.toString());
                this.executor.execute(deleteCommand);
            } else if (!localArchiveFile.delete()) {
                log.error("Failed to delete archive file for job: {}", jobId);
                this.archiveFileDeletionFailure.increment();
            }
        } catch (final Exception e) {
            log.error("Failed to delete archive file for job: {}", jobId, e);
            this.archiveFileDeletionFailure.increment();
        }
    }

    /**
     * Sends an email when the job is completed, provided the job request contains a non-empty email address.
     *
     * @param jobId The job id.
     * @return true if an email has been sent, false if the job request had no email address
     * @throws GenieException If there is any problem.
     */
    private boolean sendEmail(final String jobId) throws GenieException {
        final JobRequest jobRequest = this.jobSearchService.getJobRequest(jobId);

        final Optional<String> email = jobRequest.getEmail();
        if (!email.isPresent() || Strings.isNullOrEmpty(email.get())) {
            return false;
        }

        log.debug("Got a job finished event. Sending email: {}", email.get());
        final JobStatus status = this.jobSearchService.getJobStatus(jobId);

        final String subject = new StringBuilder()
            .append("Genie Job Finished. Id: [")
            .append(jobId)
            .append("], Name: [")
            .append(jobRequest.getName())
            .append("], Status: [")
            .append(status)
            .append("].")
            .toString();

        // NOTE(review): if getDescription() returns an Optional its toString() form ends up in the body;
        // confirm whether the wrapped value should be used instead.
        final String body = new StringBuilder()
            .append("Id: [").append(jobId).append("]\n")
            .append("Name: [").append(jobRequest.getName()).append("]\n")
            .append("Status: [").append(status).append("]\n")
            .append("User: [").append(jobRequest.getUser()).append("]\n")
            .append("Description: [").append(jobRequest.getDescription()).append("]\n")
            .append("Tags: ").append(jobRequest.getTags()).append("\n")
            .toString();

        this.mailServiceImpl.sendEmail(email.get(), subject, body);
        this.emailSuccessRate.increment();
        return true;
    }
}