package won.matcher.service.crawler.actor;

import akka.actor.ActorRef;
import akka.actor.OneForOneStrategy;
import akka.actor.SupervisorStrategy;
import akka.actor.UntypedActor;
import akka.cluster.pubsub.DistributedPubSub;
import akka.cluster.pubsub.DistributedPubSubMediator;
import akka.event.Logging;
import akka.event.LoggingAdapter;
import akka.japi.Function;
import won.matcher.service.common.event.WonNodeEvent;
import won.matcher.service.common.spring.SpringExtension;
import won.matcher.service.crawler.config.CrawlConfig;
import won.matcher.service.crawler.exception.CrawlWrapperException;
import won.matcher.service.crawler.msg.CrawlUriMessage;
import won.matcher.service.crawler.msg.ResourceCrawlUriMessage;
import won.matcher.service.crawler.service.CrawlSparqlService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Component;
import scala.concurrent.duration.Duration;
import scala.concurrent.duration.FiniteDuration;
import won.protocol.service.WonNodeInfo;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;

/**
 * Coordinates recursive crawling of linked data resources by assigning {@link CrawlUriMessage}s
 * to crawling workers of type {@link WorkerCrawlerActor} and to one single worker of type
 * {@link UpdateMetadataActor}. The process can be stopped at any time and continued later by
 * re-sending the messages that should be crawled, since meta data about the crawling process is
 * saved in the SPARQL endpoint. Saving is done by a single actor of type
 * {@link UpdateMetadataActor}, which keeps the message order to guarantee consistency in case of
 * failure. Unfinished messages can be resent to restart crawling.
 * Events about newly discovered won nodes are published on the event stream during crawling.
 * When an event is received that indicates that we are connected to that won node, crawling of
 * this won node can continue and is triggered regularly by
 * {@link won.matcher.service.nodemanager.actor.WonNodeControllerActor}.
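 * <p>
 * A minimal creation sketch, assuming the same {@code SpringExtension} setup that is used for the
 * worker actors below and an existing {@code ActorSystem system}; the actor name "MasterCrawler"
 * is only an example:
 * <pre>{@code
 * ActorRef master = system.actorOf(
 *         SpringExtension.SpringExtProvider.get(system).props(MasterCrawlerActor.class),
 *         "MasterCrawler");
 * }</pre>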
 *
 * User: hfriedrich
 * Date: 30.03.2015
 */
@Component
@Scope("prototype")
public class MasterCrawlerActor extends UntypedActor {

    private LoggingAdapter log = Logging.getLogger(getContext().system(), this);
    private static final FiniteDuration RESCHEDULE_MESSAGE_DURATION = Duration.create(500, TimeUnit.MILLISECONDS);
    private Map<String, CrawlUriMessage> pendingMessages = new HashMap<>();
    private Map<String, CrawlUriMessage> doneMessages = new HashMap<>();
    private Map<String, CrawlUriMessage> failedMessages = new HashMap<>();
    private Set<String> crawlWonNodeUris = new HashSet<>();
    private Set<String> skipWonNodeUris = new HashSet<>();
    private ActorRef crawlingWorker;
    private ActorRef updateMetaDataWorker;
    private ActorRef pubSubMediator;
    private static final String RECRAWL_TICK = "recrawl_tick";
    private static final int MIN_PENDING_MESSAGES_TO_SKIP_RECRAWLING = 10;

    @Autowired
    private CrawlConfig config;

    @Autowired
    private CrawlSparqlService sparqlService;

    @Override
    public void preStart() {

        // Schedule a recurring tick that regularly re-triggers the crawling process for all known won nodes
        getContext().system().scheduler().schedule(
                config.getRecrawlIntervalDuration(), config.getRecrawlIntervalDuration(),
                getSelf(), RECRAWL_TICK, getContext().dispatcher(), null);

        // Create the router/pool with worker actors that do the actual crawling
        crawlingWorker = getContext().actorOf(SpringExtension.SpringExtProvider.get(
                getContext().system()).fromConfigProps(WorkerCrawlerActor.class), "CrawlingRouter");

        // create a single meta data update actor for all worker actors
        updateMetaDataWorker = getContext().actorOf(SpringExtension.SpringExtProvider.get(
                getContext().system()).props(UpdateMetadataActor.class), "MetaDataUpdateWorker");
        getContext().watch(updateMetaDataWorker);

        // create a need loading actor
        getContext().actorOf(SpringExtension.SpringExtProvider.get(getContext().system()).props(
                NeedEventLoaderActor.class), "NeedEventLoader");

        // subscribe to won node events
        pubSubMediator = DistributedPubSub.get(getContext().system()).mediator();
        pubSubMediator.tell(new DistributedPubSubMediator.Subscribe(
                WonNodeEvent.class.getName(), getSelf()), getSelf());

        // subscribe to crawl events
        pubSubMediator.tell(new DistributedPubSubMediator.Subscribe(
                CrawlUriMessage.class.getName(), getSelf()), getSelf());
        pubSubMediator.tell(new DistributedPubSubMediator.Subscribe(
                ResourceCrawlUriMessage.class.getName(), getSelf()), getSelf());

        // load the unfinished uris and start crawling
        for (CrawlUriMessage msg : sparqlService.retrieveMessagesForCrawling(CrawlUriMessage.STATUS.PROCESS)) {
            pendingMessages.put(msg.getUri(), msg);
            crawlingWorker.tell(msg, getSelf());
        }
        for (CrawlUriMessage msg : sparqlService.retrieveMessagesForCrawling(CrawlUriMessage.STATUS.FAILED)) {
            getSelf().tell(msg, getSelf());
        }
    }
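    // A minimal sketch of how other components can hand URIs to this crawler, assuming they publish on the
    // distributed pub-sub topic subscribed to in preStart(); 'uri', 'baseUri', 'wonNodeUri' and 'mediator'
    // (the sender's own DistributedPubSub mediator) are placeholders:
    //
    //   CrawlUriMessage msg = new CrawlUriMessage(uri, baseUri, wonNodeUri,
    //           CrawlUriMessage.STATUS.PROCESS, System.currentTimeMillis());
    //   mediator.tell(new DistributedPubSubMediator.Publish(CrawlUriMessage.class.getName(), msg),
    //           ActorRef.noSender());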
    /**
     * Set the supervision strategy for the worker actors and handle failed crawling actions.
     *
     * @return supervisor strategy that resumes workers on handled crawling errors
     */
    @Override
    public SupervisorStrategy supervisorStrategy() {

        SupervisorStrategy supervisorStrategy = new OneForOneStrategy(
                0, Duration.Zero(), new Function<Throwable, SupervisorStrategy.Directive>() {

            @Override
            public SupervisorStrategy.Directive apply(Throwable t) throws Exception {

                log.warning("Actor encountered error: {}", t);

                // save the failed status of a crawlingWorker during crawling
                if (t instanceof CrawlWrapperException) {
                    CrawlWrapperException e = (CrawlWrapperException) t;
                    log.warning("Handled breaking message: {}", e.getBreakingMessage());
                    log.warning("Exception was: {}", e.getException());
                    processCrawlUriMessage(e.getBreakingMessage());
                    return SupervisorStrategy.resume();
                }

                // default behaviour in other cases
                return SupervisorStrategy.escalate();
            }
        });

        return supervisorStrategy;
    }

    /**
     * Process {@link won.matcher.service.crawler.msg.CrawlUriMessage} objects
     *
     * @param message message to process
     */
    @Override
    public void onReceive(final Object message) {

        if (message.equals(RECRAWL_TICK)) {
            askWonNodeInfoForCrawling();
        } else if (message instanceof WonNodeEvent) {
            processWonNodeEvent((WonNodeEvent) message);
        } else if (message instanceof CrawlUriMessage) {
            CrawlUriMessage uriMsg = (CrawlUriMessage) message;
            processCrawlUriMessage(uriMsg);
            log.debug("Number of pending messages: {}", pendingMessages.size());
        } else {
            unhandled(message);
        }
    }

    private void logStatus() {
        log.info("Number of URIs\n Crawled: {}\n Failed: {}\n Pending: {}",
                doneMessages.size(), failedMessages.size(), pendingMessages.size());
    }

    private boolean discoveredNewWonNode(String uri) {
        if (uri == null || uri.isEmpty() || crawlWonNodeUris.contains(uri) || skipWonNodeUris.contains(uri)) {
            return false;
        }
        return true;
    }
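    // Summary of the CrawlUriMessage status handling implemented in processCrawlUriMessage() below,
    // added here as orientation:
    //
    //   PROCESS/SAVE  -> deduplicate, then hand the URI to a crawling worker (tracked in pendingMessages)
    //   DONE          -> remove from pendingMessages, remember in doneMessages
    //   FAILED        -> remove from pendingMessages, remember in failedMessages
    //
    // Every status change is additionally forwarded to the single updateMetaDataWorker so that the
    // crawling state in the SPARQL endpoint survives a restart (see preStart()).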
    /**
     * Pass the messages to process to the workers and update the meta data about crawling.
     * Also create an event if a new won node is discovered.
     *
     * @param msg message to process
     */
    private void processCrawlUriMessage(CrawlUriMessage msg) {

        log.debug("Process message: {}", msg);
        if (msg.getStatus().equals(CrawlUriMessage.STATUS.PROCESS)
                || msg.getStatus().equals(CrawlUriMessage.STATUS.SAVE)) {

            // multiple extractions of the same URI can happen quite often since the extraction
            // query uses property paths from the base URI, which may return URIs that are already
            // processed. So filter out these messages here
            if (pendingMessages.get(msg.getUri()) != null || doneMessages.get(msg.getUri()) != null
                    || failedMessages.get(msg.getUri()) != null) {
                log.debug("message {} already processing/processed ...", msg);
                return;
            }

            updateMetaDataWorker.tell(msg, getSelf());

            // check if the uri belongs to a known and not skipped won node.
            // if so continue crawling, otherwise first publish an event about a newly
            // discovered won node and reschedule the processing of the current message until
            // we have received an answer for the discovered won node event
            if (discoveredNewWonNode(msg.getWonNodeUri())) {
                log.debug("discovered new won node {}", msg.getWonNodeUri());
                WonNodeEvent event = new WonNodeEvent(msg.getWonNodeUri(),
                        WonNodeEvent.STATUS.NEW_WON_NODE_DISCOVERED);
                pubSubMediator.tell(new DistributedPubSubMediator.Publish(event.getClass().getName(), event),
                        getSelf());
                getContext().system().scheduler().scheduleOnce(
                        RESCHEDULE_MESSAGE_DURATION, getSelf(), msg, getContext().dispatcher(), null);
            } else if (!skipWonNodeUris.contains(msg.getWonNodeUri())) {
                pendingMessages.put(msg.getUri(), msg);
                crawlingWorker.tell(msg, getSelf());
            }

        } else if (msg.getStatus().equals(CrawlUriMessage.STATUS.DONE)) {

            // URI crawled successfully
            log.debug("Successfully processed URI: {}", msg.getUri());
            updateMetaDataWorker.tell(msg, getSelf());
            pendingMessages.remove(msg.getUri());
            if (doneMessages.put(msg.getUri(), msg) != null) {
                log.warning("URI message received twice: {}", msg.getUri());
            }
            logStatus();

        } else if (msg.getStatus().equals(CrawlUriMessage.STATUS.FAILED)) {

            // Crawling failed
            log.debug("Crawling URI failed: {}", msg.getUri());
            updateMetaDataWorker.tell(msg, getSelf());
            pendingMessages.remove(msg.getUri());
            failedMessages.put(msg.getUri(), msg);
            logStatus();
        }
    }

    /**
     * If events about crawling or skipping certain won nodes occur, keep this information in memory.
     *
     * @param event won node event to process
     */
    private void processWonNodeEvent(WonNodeEvent event) {

        if (event.getStatus().equals(WonNodeEvent.STATUS.CONNECTED_TO_WON_NODE)) {
            // If we receive a connection event then add the won node to the list of known nodes and start crawling it
            log.debug("added new won node to set of crawling won nodes: {}", event.getWonNodeUri());
            skipWonNodeUris.remove(event.getWonNodeUri());
            crawlWonNodeUris.add(event.getWonNodeUri());
            startCrawling(event.getWonNodeInfo());
        } else if (event.getStatus().equals(WonNodeEvent.STATUS.SKIP_WON_NODE)) {
            // if we should skip this won node, remove it from the known won node list and add it to the skip list
            log.debug("skip crawling won node: {}", event.getWonNodeUri());
            crawlWonNodeUris.remove(event.getWonNodeUri());
            skipWonNodeUris.add(event.getWonNodeUri());
        }
    }
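    // A minimal sketch of the expected answer to a NEW_WON_NODE_DISCOVERED event, assuming the node
    // manager (e.g. WonNodeControllerActor) replies on the same pub-sub topic; the CONNECTED_TO_WON_NODE
    // case would additionally carry the WonNodeInfo needed by startCrawling(), which is omitted here:
    //
    //   WonNodeEvent answer = new WonNodeEvent(wonNodeUri, WonNodeEvent.STATUS.SKIP_WON_NODE);
    //   mediator.tell(new DistributedPubSubMediator.Publish(WonNodeEvent.class.getName(), answer),
    //           ActorRef.noSender());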
    /**
     * Ask for the complete won node info of all known won nodes on the event bus to initiate the crawling
     * process again. Clear the cache of crawled uris beforehand so that they can be crawled again.
     */
    private void askWonNodeInfoForCrawling() {

        if (pendingMessages.size() > MIN_PENDING_MESSAGES_TO_SKIP_RECRAWLING) {
            log.warning("Skip crawling cycle since there are currently {} messages in the pending queue. Try to " +
                    "restart crawling again in {} minutes", pendingMessages.size(),
                    config.getRecrawlIntervalDuration().toMinutes());
            return;
        }

        log.info("Start crawling process again. Clear the cached uris and crawling statistics");
        doneMessages.clear();
        failedMessages.clear();
        pendingMessages.clear();

        for (String wonNodeUri : crawlWonNodeUris) {
            log.info("ask for won node info of {}", wonNodeUri);
            WonNodeEvent event = new WonNodeEvent(wonNodeUri, WonNodeEvent.STATUS.GET_WON_NODE_INFO_FOR_CRAWLING);
            pubSubMediator.tell(new DistributedPubSubMediator.Publish(event.getClass().getName(), event), getSelf());
        }
    }

    /**
     * Start crawling a won node beginning at its need list.
     *
     * @param wonNodeInfo info about the won node to crawl
     */
    private void startCrawling(WonNodeInfo wonNodeInfo) {

        // try crawling with and without trailing "/" in the need list uri
        String needListUri = wonNodeInfo.getNeedListURI();
        if (needListUri.endsWith("/")) {
            needListUri = needListUri.substring(0, needListUri.length() - 1);
        }

        self().tell(new CrawlUriMessage(needListUri, needListUri, wonNodeInfo.getWonNodeURI(),
                CrawlUriMessage.STATUS.PROCESS, System.currentTimeMillis()), getSelf());
        self().tell(new CrawlUriMessage(needListUri + "/", needListUri + "/", wonNodeInfo.getWonNodeURI(),
                CrawlUriMessage.STATUS.PROCESS, System.currentTimeMillis()), getSelf());
    }
}