package ee.telekom.workflow.executor.lifecycle;
import java.lang.invoke.MethodHandles;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import ee.telekom.workflow.core.common.WorkflowEngineConfiguration;
import ee.telekom.workflow.core.node.NodeService;
import ee.telekom.workflow.core.recovery.RecoveryService;
import ee.telekom.workflow.executor.producer.WorkProducerJob;
import ee.telekom.workflow.executor.queue.WorkQueue;
/**
 * Spring component implementing {@link HealthCheckService}. It heals the cluster after
 * nodes have been detected as failed by delegating the actual database recovery to the
 * {@link RecoveryService}.
 */
@Component
public class HealthCheckServiceImpl implements HealthCheckService{

    private static final Logger log = LoggerFactory.getLogger( MethodHandles.lookup().lookupClass() );

    @Autowired
    private RecoveryService recoveryService;
    @Autowired
    private NodeService nodeService;
    @Autowired
    private WorkProducerJob producer;
    @Autowired
    private WorkQueue queue;
    @Autowired
    private LifecycleService lifecycleService;
    @Autowired
    private WorkflowEngineConfiguration config;

    /**
     * "Cluster healing" is meant to repair inconsistent database state that results from an unclean cluster shutdown or
     * network hardware failure. Inconsistent database state results from non-transactional operations in the cluster,
     * such as adding/taking work units to/from the work unit queue. Let's recall the possible states of
     * {@code (locked, nodeName)} of a work instance and how they relate to the non-transactional operations.
     * <ol>
     * <li>After being added to the work unit queue: {@code (true, null)}
     * <li>After being taken from the work unit queue: {@code (true, <the consuming node's name>)}
     * <li>After the work unit is successfully executed: {@code (false, null)}
     * </ol>
     * <p>
     * <b>Scenario 1:</b> A node takes a work unit from the queue and updates the associated workflow instance's
     * node_name field. Maybe it also updates the workflow instance's or work item's status field. Afterwards the node
     * is found dead/failed.<br>
     * <b>Scenario 2:</b> The master node locks a workflow instance but fails before it can add the workflow instance
     * to the work unit queue.<br>
     * <p>
     * <b>Resolution for scenario 1:</b><br>
     * <ol>
     * <li>If the node updated the workflow instance's or the work item's status field, then this field is left in the
     * temporary execution status and needs to be recovered. For workflow instances: {@code STARTING->NEW} or
     * {@code ABORTING->ABORT}. For work items: {@code EXECUTING->(is task ? NEW : ERROR)} or
     * {@code COMPLETING->EXECUTED}.
     * <li>The fields {@code (locked, nodeName)} need to be set to {@code (false, null)}.
     * </ol>
     * NB! EXECUTING task work items cannot be automatically recovered, since their execution is not guaranteed to be
     * transactional. Therefore, they need to be handled manually. To this end, their status field is set to
     * EXECUTING_ERROR and an error message is created in the EXER table.
     * <p>
     * <b>Resolution for scenario 2:</b><br>
     * First of all, we need to make sure that the work unit queue is empty and that every consumer has had sufficient
     * time to assign its most recently taken work unit to itself. To this end, every consumer is granted the so-called
     * "maximum node assignment time".
     * <p>
     * Two different kinds of errors may cause the work unit queue to be empty for at least the duration of the maximum
     * node assignment time while instances with {@code (locked, nodeName) = (true, null)} still exist. The first kind
     * is that the distributed queue failed. The second kind is that the node which took the element from the queue
     * failed between taking the element and assigning the process execution to itself.
     * <p>
     * The recovery of this scenario is expensive because we need to suspend the producer, wait for the queue to become
     * empty, wait for the maximum node assignment grace period, do the recovery and resume the producer. For this
     * reason, this advanced recovery is only run when failed nodes have actually been found.
     * <p>
     * If the engine is started, the producer is suspended for the duration of the scenario 2 recovery and is always
     * resumed afterwards, even when the recovery fails with an exception.
     */
    @Override
    public void healFailedNodes(){
        List<String> nodes = nodeService.findFailedNodes();
        if( nodes.isEmpty() ){
            return;
        }
        log.info( "Healing nodes {}", nodes );

        // recovery of locked workflow instances that are assigned to a dead node
        recoveryService.recoverExecutionsAssignedToNodes( nodes );

        // recovery of locked workflow instances that are NOT assigned to any node
        if( lifecycleService.isStarted() ){
            producer.suspend();
            try{
                queue.awaitEmpty();
                int maximumNodeAssignmentTime = config.getMaximumNodeAssignmentTimeSeconds();
                sleep( maximumNodeAssignmentTime );
                recoverExecutionsNotAssignedToNodes();
            }
            finally{
                // Never leave the producer suspended, otherwise a single failed recovery
                // attempt would stop this node from producing work units.
                producer.resume();
            }
        }
        else{
            recoverExecutionsNotAssignedToNodes();
        }

        nodeService.markEnable( nodes );
    }

    /**
     * Recovers the locked executions of the configured cluster that are not assigned to any node.
     */
    private void recoverExecutionsNotAssignedToNodes(){
        String clusterName = config.getClusterName();
        recoveryService.recoverExecutionsNotAssignedToNodes( clusterName );
    }

    /**
     * Blocks the calling thread for the given number of seconds. If the thread is interrupted while sleeping,
     * the method returns early and re-asserts the thread's interrupt status so that callers further up the
     * stack can still observe the interruption.
     *
     * @param seconds the number of seconds to sleep
     */
    private void sleep( int seconds ){
        try{
            TimeUnit.SECONDS.sleep( seconds );
        }
        catch( InterruptedException e ){
            log.warn( "Wake up after interrupt exception" );
            // Catching InterruptedException clears the flag; restore it instead of swallowing the interrupt.
            Thread.currentThread().interrupt();
        }
    }
}