/*
*
* Copyright 2016 Netflix, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.netflix.genie.web.tasks.job;
import com.netflix.genie.common.dto.JobExecution;
import com.netflix.genie.common.exceptions.GenieTimeoutException;
import com.netflix.genie.core.events.JobFinishedEvent;
import com.netflix.genie.core.events.JobFinishedReason;
import com.netflix.genie.core.events.KillJobEvent;
import com.netflix.genie.core.properties.JobsProperties;
import com.netflix.genie.core.util.ProcessChecker;
import com.netflix.genie.core.util.UnixProcessChecker;
import com.netflix.genie.web.tasks.GenieTaskScheduleType;
import com.netflix.genie.web.tasks.node.NodeTask;
import com.netflix.spectator.api.Counter;
import com.netflix.spectator.api.Registry;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.exec.ExecuteException;
import org.apache.commons.exec.Executor;
import org.apache.commons.lang3.SystemUtils;
import org.springframework.context.ApplicationEventPublisher;
import org.springframework.context.event.ApplicationEventMulticaster;
import javax.validation.Valid;
import javax.validation.constraints.NotNull;
import java.io.File;
import java.io.IOException;
import java.util.Date;
/**
* Given a process id this class will check if the job client process is running or not.
*
* @author tgianos
* @since 3.0.0
*/
@Slf4j
public class JobMonitor extends NodeTask {
// How many error iterations we can handle
// TODO: Make this a variable
private static final int MAX_ERRORS = 5;
private final String id;
private final JobExecution execution;
private final ProcessChecker processChecker;
private final ApplicationEventPublisher publisher;
private final ApplicationEventMulticaster eventMulticaster;
private final File stdOut;
private final File stdErr;
private final long maxStdOutLength;
private final long maxStdErrLength;
// Metrics
private final Counter successfulCheckRate;
private final Counter timeoutRate;
private final Counter finishedRate;
private final Counter unsuccessfulCheckRate;
private final Counter stdOutTooLarge;
private final Counter stdErrTooLarge;
private int errorCount;
/**
* Constructor.
*
* @param execution The job execution object including the pid
* @param stdOut The std out output file
* @param stdErr The std err output file
* @param executor The process executor to use
* @param publisher The event publisher to use when a job isn't running anymore
* @param eventMulticaster The multicaster to send async events
* @param registry The metrics event registry
* @param jobsProperties The properties for jobs
*/
public JobMonitor(
@Valid final JobExecution execution,
@NotNull final File stdOut,
@NotNull final File stdErr,
@NotNull final Executor executor,
@NotNull final ApplicationEventPublisher publisher,
@NotNull final ApplicationEventMulticaster eventMulticaster,
@NotNull final Registry registry,
@NotNull final JobsProperties jobsProperties
) {
if (!SystemUtils.IS_OS_UNIX) {
throw new UnsupportedOperationException("Genie doesn't currently support " + SystemUtils.OS_NAME);
}
this.errorCount = 0;
this.id = execution.getId().orElseThrow(IllegalArgumentException::new);
this.execution = execution;
this.publisher = publisher;
this.eventMulticaster = eventMulticaster;
final int processId = execution.getProcessId().orElseThrow(IllegalArgumentException::new);
final Date timeout = execution.getTimeout().orElseThrow(IllegalArgumentException::new);
this.processChecker = new UnixProcessChecker(processId, executor, timeout);
this.stdOut = stdOut;
this.stdErr = stdErr;
this.maxStdOutLength = jobsProperties.getMax().getStdOutSize();
this.maxStdErrLength = jobsProperties.getMax().getStdErrSize();
this.successfulCheckRate = registry.counter("genie.jobs.successfulStatusCheck.rate");
this.timeoutRate = registry.counter("genie.jobs.timeout.rate");
this.finishedRate = registry.counter("genie.jobs.finished.rate");
this.unsuccessfulCheckRate = registry.counter("genie.jobs.unsuccessfulStatusCheck.rate");
this.stdOutTooLarge = registry.counter("genie.jobs.stdOutTooLarge.rate");
this.stdErrTooLarge = registry.counter("genie.jobs.stdErrTooLarge.rate");
}
/**
* This will check the process identified by the pid supplied to the constructor. If the pid no longer exists fires
* an event to the system saying the job is done.
*/
@Override
public void run() {
try {
// Blocks until result
this.processChecker.checkProcess();
log.debug("Job {} is still running...", this.id);
if (this.errorCount != 0) {
this.errorCount = 0;
}
if (this.stdOut.exists() && this.stdOut.length() > this.maxStdOutLength) {
this.publisher.publishEvent(new KillJobEvent(this.id, "Std out length exceeded", this));
this.stdOutTooLarge.increment();
return;
}
if (this.stdErr.exists() && this.stdErr.length() > this.maxStdErrLength) {
this.publisher.publishEvent(new KillJobEvent(this.id, "Std err length exceeded", this));
this.stdErrTooLarge.increment();
return;
}
this.successfulCheckRate.increment();
} catch (final GenieTimeoutException gte) {
log.info("Job {} has timed out", this.execution.getId(), gte);
this.timeoutRate.increment();
this.publisher.publishEvent(new KillJobEvent(this.id, "Job exceeded timeout", this));
} catch (final ExecuteException ee) {
log.info("Job {} has finished", this.id);
this.finishedRate.increment();
this.eventMulticaster.multicastEvent(
new JobFinishedEvent(
this.id,
JobFinishedReason.PROCESS_COMPLETED,
"Process detected to be complete",
this
)
);
} catch (final IOException ioe) {
// Some other error
log.error(
"Some IOException happened unable to check process status for pid {}",
this.execution.getProcessId(),
ioe
);
this.errorCount++;
this.unsuccessfulCheckRate.increment();
// If this keeps throwing errors out we should kill the job
if (this.errorCount > MAX_ERRORS) {
// TODO: What if they throw an exception?
this.publisher.publishEvent(
new KillJobEvent(
this.id,
"Couldn't check process status " + MAX_ERRORS + " consecutive times",
this
)
);
// Also send a job finished event
this.eventMulticaster.multicastEvent(
new JobFinishedEvent(
this.id,
JobFinishedReason.KILLED,
"Couldn't check process status " + MAX_ERRORS + " consecutive times",
this
)
);
}
}
}
/**
* {@inheritDoc}
*/
@Override
public GenieTaskScheduleType getScheduleType() {
return GenieTaskScheduleType.FIXED_DELAY;
}
/**
* {@inheritDoc}
*/
@Override
public long getFixedDelay() {
return this.execution.getCheckDelay().orElseThrow(IllegalArgumentException::new);
}
}