package won.matcher.service.crawler.actor;
import akka.actor.ActorRef;
import akka.actor.OneForOneStrategy;
import akka.actor.SupervisorStrategy;
import akka.actor.UntypedActor;
import akka.cluster.pubsub.DistributedPubSub;
import akka.cluster.pubsub.DistributedPubSubMediator;
import akka.event.Logging;
import akka.event.LoggingAdapter;
import akka.japi.Function;
import won.matcher.service.common.event.WonNodeEvent;
import won.matcher.service.common.spring.SpringExtension;
import won.matcher.service.crawler.config.CrawlConfig;
import won.matcher.service.crawler.exception.CrawlWrapperException;
import won.matcher.service.crawler.msg.CrawlUriMessage;
import won.matcher.service.crawler.msg.ResourceCrawlUriMessage;
import won.matcher.service.crawler.service.CrawlSparqlService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Component;
import scala.concurrent.duration.Duration;
import scala.concurrent.duration.FiniteDuration;
import won.protocol.service.WonNodeInfo;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
/**
* Coordinates recursive crawling of linked data resources by assigning {@link CrawlUriMessage}s
* to {@link WorkerCrawlerActor} workers and to a single {@link UpdateMetadataActor}.
* The process can be stopped at any time and continued later by passing in the messages that
* should be crawled again, since metadata about the crawling process is saved in the SPARQL
* endpoint. The metadata updates are performed by the single {@link UpdateMetadataActor},
* which preserves message order to guarantee consistency in case of failure. Unfinished
* messages can be resent to restart crawling.
* Events about newly discovered won nodes are published on the event stream during crawling.
* When an event is received that indicates that we are connected to such a won node, crawling
* of that won node can continue and is triggered regularly by
* {@link won.matcher.service.nodemanager.actor.WonNodeControllerActor}.
*
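* <p>
* For illustration only (assuming an {@code ActorSystem} reference {@code system}): a crawl of a
* single resource could be triggered by publishing a {@link CrawlUriMessage} on the distributed
* pub-sub topic this actor subscribes to. The URIs below are placeholders:
* <pre>{@code
* ActorRef mediator = DistributedPubSub.get(system).mediator();
* CrawlUriMessage msg = new CrawlUriMessage(
*   "https://example.org/won/resource/need/1",  // URI to crawl (placeholder)
*   "https://example.org/won/resource/need/1",  // base URI (placeholder)
*   "https://example.org/won",                  // won node URI (placeholder)
*   CrawlUriMessage.STATUS.PROCESS, System.currentTimeMillis());
* mediator.tell(new DistributedPubSubMediator.Publish(
*   CrawlUriMessage.class.getName(), msg), ActorRef.noSender());
* }</pre>
*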
* User: hfriedrich
* Date: 30.03.2015
*/
@Component
@Scope("prototype")
public class MasterCrawlerActor extends UntypedActor
{
private LoggingAdapter log = Logging.getLogger(getContext().system(), this);
private static final FiniteDuration RESCHEDULE_MESSAGE_DURATION = Duration.create(500, TimeUnit.MILLISECONDS);
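// crawl state per URI: messages currently being processed, successfully crawled and failed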
private Map<String, CrawlUriMessage> pendingMessages = new HashMap<>();
private Map<String, CrawlUriMessage> doneMessages = new HashMap<>();
private Map<String, CrawlUriMessage> failedMessages = new HashMap<>();
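// won node URIs that are crawled vs. won node URIs that are skipped during crawling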
private Set<String> crawlWonNodeUris = new HashSet<>();
private Set<String> skipWonNodeUris = new HashSet<>();
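// crawling worker router, single metadata update worker and pub-sub mediator, initialized in preStart()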
private ActorRef crawlingWorker;
private ActorRef updateMetaDataWorker;
private ActorRef pubSubMediator;
private static final String RECRAWL_TICK = "recrawl_tick";
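// a recrawl tick is ignored while more than this many messages are still pending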
private static final int MIN_PENDING_MESSAGES_TO_SKIP_RECRAWLING = 10;
@Autowired
private CrawlConfig config;
@Autowired
private CrawlSparqlService sparqlService;
@Override
public void preStart() {
// schedule the recrawl tick that periodically restarts the crawling process
getContext().system().scheduler().schedule(config.getRecrawlIntervalDuration(), config.getRecrawlIntervalDuration(),
getSelf(), RECRAWL_TICK, getContext().dispatcher(), null);
// Create the router/pool with worker actors that do the actual crawling
crawlingWorker = getContext().actorOf(SpringExtension.SpringExtProvider.get(getContext().system()).fromConfigProps(
WorkerCrawlerActor.class), "CrawlingRouter");
// create a single metadata update actor for all worker actors
updateMetaDataWorker = getContext().actorOf(SpringExtension.SpringExtProvider.get(getContext().system()).props(
UpdateMetadataActor.class), "MetaDataUpdateWorker");
getContext().watch(updateMetaDataWorker);
// create a need event loading actor
getContext().actorOf(SpringExtension.SpringExtProvider.get(getContext().system()).props(
NeedEventLoaderActor.class), "NeedEventLoader");
// subscribe for won node events
pubSubMediator = DistributedPubSub.get(getContext().system()).mediator();
pubSubMediator.tell(new DistributedPubSubMediator.Subscribe(WonNodeEvent.class.getName(), getSelf()), getSelf());
// subscribe to crawl events
pubSubMediator.tell(new DistributedPubSubMediator.Subscribe(CrawlUriMessage.class.getName(), getSelf()), getSelf());
pubSubMediator.tell(new DistributedPubSubMediator.Subscribe(
ResourceCrawlUriMessage.class.getName(), getSelf()), getSelf());
// load the unfinished uris and start crawling
for (CrawlUriMessage msg : sparqlService.retrieveMessagesForCrawling(CrawlUriMessage.STATUS.PROCESS)) {
pendingMessages.put(msg.getUri(), msg);
crawlingWorker.tell(msg, getSelf());
}
for (CrawlUriMessage msg : sparqlService.retrieveMessagesForCrawling(CrawlUriMessage.STATUS.FAILED)) {
getSelf().tell(msg, getSelf());
}
}
/**
* Set the supervision strategy for the worker actors and handle failed crawling actions.
*
* @return supervisor strategy that resumes the worker on a {@link CrawlWrapperException} and escalates otherwise
*/
@Override
public SupervisorStrategy supervisorStrategy() {
SupervisorStrategy supervisorStrategy = new OneForOneStrategy(
0, Duration.Zero(), new Function<Throwable, SupervisorStrategy.Directive>()
{
@Override
public SupervisorStrategy.Directive apply(Throwable t) throws Exception {
log.warning("Actor encountered error: {}", t);
// save the failed status of a crawling action performed by a worker
if (t instanceof CrawlWrapperException) {
CrawlWrapperException e = (CrawlWrapperException) t;
log.warning("Handled breaking message: {}", e.getBreakingMessage());
log.warning("Exception was: {}", e.getException());
processCrawlUriMessage(e.getBreakingMessage());
return SupervisorStrategy.resume();
}
// default behaviour in other cases
return SupervisorStrategy.escalate();
}
});
return supervisorStrategy;
}
/**
* Process recrawl ticks, {@link WonNodeEvent}s and
* {@link won.matcher.service.crawler.msg.CrawlUriMessage}s.
*
* @param message message to process
*/
@Override
public void onReceive(final Object message) {
if (message.equals(RECRAWL_TICK)) {
askWonNodeInfoForCrawling();
} else if (message instanceof WonNodeEvent) {
processWonNodeEvent((WonNodeEvent) message);
} else if (message instanceof CrawlUriMessage) {
CrawlUriMessage uriMsg = (CrawlUriMessage) message;
processCrawlUriMessage(uriMsg);
log.debug("Number of pending messages: {}", pendingMessages.size());
} else {
unhandled(message);
}
}
private void logStatus() {
log.info("Number of URIs\n Crawled: {}\n Failed: {}\n Pending: {}",
doneMessages.size(), failedMessages.size(), pendingMessages.size());
}
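/**
* Check whether a won node URI is neither known for crawling nor marked to be skipped.
*
* @param uri won node URI, may be null or empty
* @return true if the won node has not been seen before
*/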
private boolean discoveredNewWonNode(String uri) {
if (uri == null || uri.isEmpty() || crawlWonNodeUris.contains(uri) || skipWonNodeUris.contains(uri)) {
return false;
}
return true;
}
/**
* Pass messages that should be processed to the workers and update the crawling metadata.
* Also publish an event if a new won node is discovered.
*
* @param msg crawl URI message to process
*/
private void processCrawlUriMessage(CrawlUriMessage msg) {
log.debug("Process message: {}", msg);
if (msg.getStatus().equals(CrawlUriMessage.STATUS.PROCESS) || msg.getStatus().equals(CrawlUriMessage.STATUS.SAVE)) {
// multiple extractions of the same URI can happen quite often since the extraction
// query uses property paths from the base URI, which may return URIs that have already
// been processed, so filter out those messages here
if (pendingMessages.get(msg.getUri()) != null ||
doneMessages.get(msg.getUri()) != null ||
failedMessages.get(msg.getUri()) != null) {
log.debug("message {} already processing/processed ...", msg);
return;
}
updateMetaDataWorker.tell(msg, getSelf());
// check if the uri belongs to a known won node that is not skipped.
// if so, continue crawling; otherwise first publish an event about the newly
// discovered won node and reschedule the processing of the current message until
// we have received an answer to the discovered won node event
if (discoveredNewWonNode(msg.getWonNodeUri())) {
log.debug("discovered new won node {}", msg.getWonNodeUri());
WonNodeEvent event = new WonNodeEvent(msg.getWonNodeUri(), WonNodeEvent.STATUS.NEW_WON_NODE_DISCOVERED);
pubSubMediator.tell(new DistributedPubSubMediator.Publish(event.getClass().getName(), event), getSelf());
getContext().system().scheduler().scheduleOnce(
RESCHEDULE_MESSAGE_DURATION, getSelf(), msg, getContext().dispatcher(), null);
} else if (!skipWonNodeUris.contains(msg.getWonNodeUri())) {
pendingMessages.put(msg.getUri(), msg);
crawlingWorker.tell(msg, getSelf());
}
} else if (msg.getStatus().equals(CrawlUriMessage.STATUS.DONE)) {
// URI crawled successfully
log.debug("Successfully processed URI: {}", msg.getUri());
updateMetaDataWorker.tell(msg, getSelf());
pendingMessages.remove(msg.getUri());
if (doneMessages.put(msg.getUri(), msg) != null) {
log.warning("URI message received twice: {}", msg.getUri());
}
logStatus();
} else if (msg.getStatus().equals(CrawlUriMessage.STATUS.FAILED)) {
// Crawling failed
log.debug("Crawling URI failed: {}", msg.getUri());
updateMetaDataWorker.tell(msg, getSelf());
pendingMessages.remove(msg.getUri());
failedMessages.put(msg.getUri(), msg);
logStatus();
}
}
/**
* Keep track in memory of which won nodes should be crawled and which should be skipped,
* based on the won node events received on the event bus.
*
* @param event won node event to process
*/
private void processWonNodeEvent(WonNodeEvent event) {
if (event.getStatus().equals(WonNodeEvent.STATUS.CONNECTED_TO_WON_NODE)) {
// If we receive a connection event then add the won node to the list of known nodes and start crawling it
log.debug("added new won node to set of crawling won nodes: {}", event.getWonNodeUri());
skipWonNodeUris.remove(event.getWonNodeUri());
crawlWonNodeUris.add(event.getWonNodeUri());
startCrawling(event.getWonNodeInfo());
} else if (event.getStatus().equals(WonNodeEvent.STATUS.SKIP_WON_NODE)) {
// if we should skip this won node remove it from the known won node list and add it to the skip list
log.debug("skip crawling won node: {}", event.getWonNodeUri());
crawlWonNodeUris.remove(event.getWonNodeUri());
skipWonNodeUris.add(event.getWonNodeUri());
}
}
/**
* Ask for the complete won node info of all known won nodes on the event bus to initiate the
* crawling process again. Clear the cache of crawled URIs beforehand so that they can be crawled again.
*/
private void askWonNodeInfoForCrawling() {
if (pendingMessages.size() > MIN_PENDING_MESSAGES_TO_SKIP_RECRAWLING) {
log.warning("Skip crawling cylce since there are currently {} messages in the pending queue. Try to restart " +
"crawling again in {} minutes", pendingMessages.size(), config.getRecrawlIntervalDuration().toMinutes());
return;
}
log.info("Start crwaling process again. Clear the cached uris and crawling statistics");
doneMessages.clear();
failedMessages.clear();
pendingMessages.clear();
for (String wonNodeUri : crawlWonNodeUris) {
log.info("ask for won node info of {}", wonNodeUri);
WonNodeEvent event = new WonNodeEvent(wonNodeUri, WonNodeEvent.STATUS.GET_WON_NODE_INFO_FOR_CRAWLING);
pubSubMediator.tell(new DistributedPubSubMediator.Publish(event.getClass().getName(), event), getSelf());
}
}
/**
* Start crawling a won node, beginning at its need list.
*
* @param wonNodeInfo info about the won node to crawl
*/
private void startCrawling(WonNodeInfo wonNodeInfo) {
// try crawling with and without a trailing "/" in the need list uri
String needListUri = wonNodeInfo.getNeedListURI();
if (needListUri.endsWith("/")) {
needListUri = needListUri.substring(0, needListUri.length() - 1);
}
self().tell(new CrawlUriMessage(needListUri, needListUri, wonNodeInfo.getWonNodeURI(),
CrawlUriMessage.STATUS.PROCESS, System.currentTimeMillis()), getSelf());
self().tell(new CrawlUriMessage(needListUri + "/", needListUri + "/", wonNodeInfo.getWonNodeURI(),
CrawlUriMessage.STATUS.PROCESS, System.currentTimeMillis()), getSelf());
}
}