package me.hao0.antares.store.support; import com.alibaba.fastjson.JSON; import com.google.common.base.Objects; import com.google.common.base.Predicates; import com.google.common.base.Strings; import com.google.common.base.Throwables; import me.hao0.antares.common.dto.JobDetail; import me.hao0.antares.common.dto.JobFireTime; import me.hao0.antares.common.dto.JobInstanceWaitResp; import me.hao0.antares.common.dto.ShardFinishDto; import me.hao0.antares.common.exception.JobStateTransferInvalidException; import me.hao0.antares.common.log.Logs; import me.hao0.antares.common.model.App; import me.hao0.antares.common.model.Job; import me.hao0.antares.common.model.JobInstance; import me.hao0.antares.common.model.enums.JobInstanceShardStatus; import me.hao0.antares.common.model.enums.JobInstanceStatus; import me.hao0.antares.common.model.enums.JobState; import me.hao0.antares.common.retry.Retryer; import me.hao0.antares.common.retry.Retryers; import me.hao0.antares.common.support.SimpleJobStateMachine; import me.hao0.antares.common.util.*; import me.hao0.antares.common.zk.Lock; import me.hao0.antares.common.zk.NodeListener; import me.hao0.antares.common.zk.NodeWatcher; import me.hao0.antares.store.dao.*; import org.springframework.beans.factory.DisposableBean; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import java.util.Date; import java.util.List; import java.util.concurrent.Callable; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; /** * Job support * Author: haolin * Email: haolin.h0@gmail.com */ @Component public class JobSupport implements DisposableBean { @Autowired private AntaresZkClient zk; @Autowired private AppDao appDao; @Autowired private JobDao jobDao; @Autowired private JobInstanceDao jobInstanceDao; @Autowired private JobInstanceShardDao jobInstanceShardDao; private final ExecutorService executor; /** * The retryer for checking job instance finish or not */ private final Retryer<Boolean> checkJobInstanceFinishRetryer = Retryers.get().newRetryer(Predicates.<Boolean>alwaysFalse(), 5); public JobSupport(){ executor = Executors.newExecutor(Systems.cpuNum(), 10000, "JOB-FINISH-CHECKER-"); } /** * Trigger the job instance * @param appName the app name * @param jobClass the job class * @param instance the instance */ public void triggerJobInstance(String appName, String jobClass, JobInstance instance) { String jobInstancePath = ZkPaths.pathOfJobInstance(appName, jobClass, instance.getId()); zk.client().create(jobInstancePath, instance.getStatus()); } /** * Waiting the job instance finished * @param appName the app name * @param jobClass the job class * @param timeout the timeout seconds to waiting the job instance finished * @param jobInstanceId the job instance id * @return return the job instance wait response */ public JobInstanceWaitResp waitingJobInstanceFinish(final String appName, final String jobClass, final Long jobInstanceId, long timeout) { final CountDownLatch latch = new CountDownLatch(1); String jobInstanceNode = ZkPaths.pathOfJobInstance(appName, jobClass, jobInstanceId); NodeWatcher watcher = zk.client().newNodeWatcher(jobInstanceNode, new NodeListener() { @Override public void onDelete() { // the job instance has finished latch.countDown(); } }); try { Logs.info("Waiting the job({}/{}/{}) with timeout({}) to be finished.", appName, jobClass, jobInstanceId, timeout); if (timeout > 0L){ // need take in account the timeout if (!latch.await(timeout, TimeUnit.SECONDS)){ return JobInstanceWaitResp.timeout(); } } else { // no need take in account the timeout latch.await(); } } catch (InterruptedException e) { // occur error throw new RuntimeException(e); } finally { if (watcher != null){ watcher.stop(); } } Logs.info("The job({}/{}/{}) has finished.", appName, jobClass, jobInstanceId); return JobInstanceWaitResp.success(); } /** * Delete the job instance from zk * @param appName the app name * @param jobClass the job class * @param instance the job instance * @return return true if finished the job instance, or false */ public Boolean deleteJobInstance(final String appName, final String jobClass, final JobInstance instance){ return deleteJobInstance(appName, jobClass, instance.getId()); } /** * Delete the job instance from zk * @param appName the app name * @param jobClass the job class * @param jobInstanceId the job instance id * @return return true if finished the job instance, or false */ public Boolean deleteJobInstance(final String appName, final String jobClass, final Long jobInstanceId){ // delete the job instance String jobInstanceNode = ZkPaths.pathOfJobInstance(appName, jobClass, jobInstanceId); zk.client().deleteIfExists(jobInstanceNode); return Boolean.TRUE; } /** * Delete all the instances of the job * @param appName the app name * @param jobClass the job class * @return return true if delete successfully, or false */ public Boolean deleteJobInstances(String appName, String jobClass) { List<String> instanceIds = findJobInstances(appName, jobClass); if (!CollectionUtil.isNullOrEmpty(instanceIds)){ for (String instanceId : instanceIds){ deleteJobInstance(appName, jobClass, Long.valueOf(instanceId)); } } return Boolean.TRUE; } /** * Find the job instance ids of the job * @param appName the app name * @param jobClass the job class * @return the job instance ids */ public List<String> findJobInstances(String appName, String jobClass){ String jobInstancesNode = ZkPaths.pathOfJobInstances(appName, jobClass); return zk.client().gets(jobInstancesNode); } /** * Update the job fire time info * @param appName the app name * @param jobClass the job class * @param jobFireTime the job fire time * @return return true if update successfully, or false */ public Boolean updateJobFireTime(String appName, String jobClass, JobFireTime jobFireTime) { String jobFireTimeNode = ZkPaths.pathOfJobFireTime(appName, jobClass); zk.client().mkdirs(jobFireTimeNode); return zk.client().update(jobFireTimeNode, JSON.toJSONString(jobFireTime)); } /** * Get the job fire time info * @param appName the app name * @param jobClass the job class * @return the job fire time info */ public JobFireTime getJobFireTime(String appName, String jobClass){ String jobFireTimeNode = ZkPaths.pathOfJobFireTime(appName, jobClass); if (!zk.client().checkExists(jobFireTimeNode)){ return null; } return zk.client().getJson(jobFireTimeNode, JobFireTime.class); } /** * Update the job running state directly * @param appName the app name * @param jobClass the job class * @param state the target state * @return return true if update successfully, or false */ public Boolean updateJobStateDirectly(String appName, String jobClass, JobState state){ String jobStateNode = ZkPaths.pathOfJobState(appName, jobClass); zk.client().mkdirs(jobStateNode); return zk.client().update(jobStateNode, state.value()); } /** * Update the job running state safely, will be constrained by statemachine * @param appName the app name * @param jobClass the job class * @param targetState the new state * @return return true if update successfully, or throw JobStateTransferInvalidException * @see SimpleJobStateMachine * @see JobStateTransferInvalidException */ public Boolean updateJobStateSafely(String appName, String jobClass, JobState targetState){ JobState currentState = getJobState(appName, jobClass); if(!SimpleJobStateMachine.get().allow(currentState, targetState)){ throw new JobStateTransferInvalidException(appName + "/" + jobClass, currentState, targetState); } String jobStateNode = ZkPaths.pathOfJobState(appName, jobClass); return zk.client().update(jobStateNode, targetState.value()); } /** * Check the job state operate valid or not * @param appName the app name * @param jobClass the job class * @param expectState the expect state * @param targetState the new state * @see JobStateTransferInvalidException */ public void checkJobStateOperate(String appName, String jobClass, JobState expectState, JobState targetState){ JobState currentState = getJobState(appName, jobClass); if ((expectState != null && expectState != currentState) || !SimpleJobStateMachine.get().allow(currentState, targetState)){ throw new JobStateTransferInvalidException(appName + "/" + jobClass, currentState, targetState); } } /** * Get the job state * @param appName the app name * @param jobClass the job class * @return the job state */ public JobState getJobState(String appName, String jobClass) { String jobStateNode = ZkPaths.pathOfJobState(appName, jobClass); if (!zk.client().checkExists(jobStateNode)){ return JobState.STOPPED; } return JobState.from(zk.client().getInteger(jobStateNode)); } /** * Update the job's scheudler * @param appName the app name * @param jobClass the job class * @param scheduler the scheduler * @return return true if update successfully, or false */ public Boolean updateJobScheduler(String appName, String jobClass, String scheduler) { String jobSchedulerNode = ZkPaths.pathOfJobScheduler(appName, jobClass); zk.client().mkdirs(jobSchedulerNode); return zk.client().update(jobSchedulerNode, scheduler); } /** * Get the job scheduler * @param appName the app name * @param jobClass the job class * @return the job scheduler */ public String getJobScheduler(String appName, String jobClass) { String jobSchedulerNode = ZkPaths.pathOfJobScheduler(appName, jobClass); if (!zk.client().checkExists(jobSchedulerNode)){ return null; } return zk.client().getString(jobSchedulerNode); } /** * Make the job instances node * @param appName the app name * @param jobClass the job class * @return return true if make successfully, or false */ public Boolean mkJobInstances(String appName, String jobClass) { return zk.client().mkdirs(ZkPaths.pathOfJobInstances(appName, jobClass)); } /** * Remove the job from zk * @param jobDetail the job detail * @return return true if remove successfully, or false */ public Boolean removeJob(JobDetail jobDetail){ String appJobPath = ZkPaths.pathOfJob(jobDetail.getApp().getAppName(), jobDetail.getJob().getClazz()); zk.client().deleteRecursivelyIfExists(appJobPath); return Boolean.TRUE; } /** * Checking the job is scheduling or not * @param appName the app name * @param jobClass the job class * @return return true if the job is scheduling, or false */ public Boolean checkJobScheduling(String appName, String jobClass) { String jobPath = ZkPaths.pathOfJob(appName, jobClass); if(!zk.client().checkExists(jobPath)){ return Boolean.FALSE; } String scheduler = getJobScheduler(appName, jobClass); if(Strings.isNullOrEmpty(scheduler)){ // The scheduler is empty return Boolean.FALSE; } if(!zk.client().checkExists(ZkPaths.pathOfServer(scheduler))){ // The scheduler server offline return Boolean.FALSE; } return Boolean.TRUE; } /** * Check the job instance finish or not * @param shardFinishDto the shard finish dto */ public void checkJobInstanceFinish(final ShardFinishDto shardFinishDto){ executor.submit(new Runnable() { @Override public void run() { try { checkJobInstanceFinishRetryer.call(new RetryableCheckJobInstanceFinishTask(shardFinishDto)); // doCheckJobInstanceFinish(shardFinishDto); } catch (Exception e) { Logs.error("failed to check job instance finish({}), cause: {}", shardFinishDto, Throwables.getStackTraceAsString(e)); } } }); } /** * Check the job has one running job instance * @param appName the app name * @param jobClass the job class * @return return true if has one running job instance, or false */ public boolean hasJobInstance(String appName, String jobClass) { String jobInstanceNodePath = ZkPaths.pathOfJobInstances(appName, jobClass); List<String> instances = zk.client().gets(jobInstanceNodePath); return !CollectionUtil.isNullOrEmpty(instances); } /** * Force to stop the current job instance * @param jobDetail the job detail * @param finalStatus the final status of the job instance */ public Boolean forceStopJobInstance(JobDetail jobDetail, JobInstanceStatus finalStatus) { List<String> jobInstanceIds = findJobInstances(jobDetail.getApp().getAppName(), jobDetail.getJob().getClazz()); if (!CollectionUtil.isNullOrEmpty(jobInstanceIds)){ for (String jobInstanceId : jobInstanceIds){ forceStopJobInstance(jobDetail, Long.valueOf(jobInstanceId), finalStatus); } } return Boolean.TRUE; } /** * Force to stop the current job instance * @param jobDetail the job detail * @param jobInstanceId the job instance id * @param finalStatus the final status * @see JobInstanceStatus */ private void forceStopJobInstance(JobDetail jobDetail, Long jobInstanceId, JobInstanceStatus finalStatus) { // lock the job instance // avoid the job instance finished before Lock jobInstanceLock = lockJobInstance(jobInstanceId); while (!jobInstanceLock.lock(5000)){ // lock timeout Logs.warn("failed to lock the job instance when force stop job instance(jobDetail={}, jobInstanceId={}).", jobDetail, jobInstanceId); } try { JobInstance instance = jobInstanceDao.findById(jobInstanceId); if (JobInstanceStatus.isFinal(instance.getStatus())){ // job instance is final return; } // try to delete the job instance from zk String appName = jobDetail.getApp().getAppName(); String jobClass = jobDetail.getJob().getClazz(); if(!deleteJobInstance(appName, jobClass, instance)){ Logs.warn("failed to delete job instance from zk when force stop job instance((jobDetail={}, jobInstance={})).", instance, jobDetail); } instance.setUtime(new Date()); instance.setStatus(finalStatus.value()); jobInstanceDao.save(instance); updateJobStateSafely(appName, jobClass, JobState.WAITING); } catch (Exception e){ Logs.error("failed to force stop job instance(jobDetail={}, jobInstanceId={}), cause: {}", jobDetail, jobInstanceId, Throwables.getStackTraceAsString(e)); } finally { jobInstanceLock.unlock(); } } /** * The retry task for check job instance finish */ private class RetryableCheckJobInstanceFinishTask implements Callable<Boolean> { private final ShardFinishDto shardFinishDto; public RetryableCheckJobInstanceFinishTask(ShardFinishDto shardFinishDto) { this.shardFinishDto = shardFinishDto; } @Override public Boolean call() throws Exception { return doCheckJobInstanceFinish(shardFinishDto); } } /** * Check whether the job instance has finished or not * @param shardFinishDto the shard finish dto * @return return true if check successfully, or false */ private Boolean doCheckJobInstanceFinish(ShardFinishDto shardFinishDto) { Long instanceId = shardFinishDto.getInstanceId(); // loop lock // avoid the locked server crashed before finishing the job instance Lock jobInstanceLock = lockJobInstance(instanceId); while (!jobInstanceLock.lock(5000)){ // lock timeout Logs.warn("failed to lock the job instance(id={}) when check job instance finish, will retry", instanceId); } // try/catch doesn't impact on the shard finished try { JobInstance instance = jobInstanceDao.findById(instanceId); if (JobInstanceStatus.isFinal(instance.getStatus())){ // job instance is final return Boolean.TRUE; } // whether all shards are finished Integer totalShardCount = jobInstanceShardDao.getJobInstanceTotalShardCount(instanceId); Integer successShardCount = jobInstanceShardDao.getJobInstanceStatusShardCount(instanceId, JobInstanceShardStatus.SUCCESS); Integer failedShardCount = jobInstanceShardDao.getJobInstanceStatusShardCount(instanceId, JobInstanceShardStatus.FAILED); if (Objects.equal(totalShardCount, successShardCount + failedShardCount)){ // try to delete the job instance from zk Job job = jobDao.findById(instance.getJobId()); App app = appDao.findById(job.getAppId()); if(!deleteJobInstance(app.getAppName(), job.getClazz(), instance)){ Logs.warn("failed to delete job instance({}) from zk when check the job instance finish.", instance); } // update the job instance instance.setEndTime(shardFinishDto.getEndTime()); if (failedShardCount > 0){ // there are shards failed instance.setStatus(JobInstanceStatus.FAILED.value()); } else { // all shards success instance.setStatus(JobInstanceStatus.SUCCESS.value()); } instance.setUtime(new Date()); return jobInstanceDao.save(instance); } return Boolean.TRUE; } catch (Exception e){ Logs.error("failed to check whether the job instance(id={}) has finished, cause: {}", instanceId, Throwables.getStackTraceAsString(e)); return Boolean.FALSE; } finally { jobInstanceLock.unlock(); } } /** * Lock the job instance check finish lock * @param jobInstanceId the job instance id * @return the lock */ private Lock lockJobInstance(Long jobInstanceId){ return zk.client().newLock(ZkPaths.pathOfJobInstanceLock(jobInstanceId)); } @Override public void destroy() throws Exception { executor.shutdown(); } }