package won.matcher.service.crawler.actor;

import akka.actor.ActorRef;
import akka.actor.UntypedActor;
import akka.cluster.pubsub.DistributedPubSub;
import akka.cluster.pubsub.DistributedPubSubMediator;
import akka.event.Logging;
import akka.event.LoggingAdapter;
import org.apache.jena.query.Dataset;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Component;
import org.springframework.web.client.RestClientException;
import won.matcher.service.common.event.NeedEvent;
import won.matcher.service.common.service.sparql.SparqlService;
import won.matcher.service.crawler.config.CrawlConfig;
import won.matcher.service.crawler.exception.CrawlWrapperException;
import won.matcher.service.crawler.msg.CrawlUriMessage;
import won.matcher.service.crawler.msg.ResourceCrawlUriMessage;
import won.matcher.service.crawler.service.CrawlSparqlService;
import won.protocol.exception.DataIntegrityException;
import won.protocol.exception.IncorrectPropertyCountException;
import won.protocol.model.NeedState;
import won.protocol.util.NeedModelWrapper;
import won.protocol.util.RdfUtils;
import won.protocol.util.linkeddata.LinkedDataSource;
import won.protocol.vocabulary.WON;
import java.net.URI;
import java.util.Set;

/**
 * Actor that requests a linked data URI via HTTP, saves it to a triple store using a SPARQL UPDATE query
 * and responds to the sender with the URIs extracted from the linked data resource.
 *
 * This class uses property paths to extract URIs from linked data resources. The property paths are executed
 * relative to base URIs, and come in two types: base property paths extract URIs that are taken as new base
 * URIs, while non-base property paths extract URIs that keep the current base URI.
 *
 * User: hfriedrich
 * Date: 07.04.2015
 */
@Component
@Scope("prototype")
public class WorkerCrawlerActor extends UntypedActor {

    private LoggingAdapter log = Logging.getLogger(getContext().system(), this);

    @Autowired
    private LinkedDataSource linkedDataSource;

    @Autowired
    private CrawlSparqlService sparqlService;

    @Autowired
    private CrawlConfig config;

    private ActorRef pubSubMediator;

    @Override
    public void preStart() {
        // initialize the distributed event bus to send need events to the matchers
        pubSubMediator = DistributedPubSub.get(getContext().system()).mediator();
    }

    /**
     * Receives messages with a URI and processes them by requesting the resource,
     * saving it to a triple store, extracting URIs from the content and answering the sender.
     *
     * @param msg if type is {@link CrawlUriMessage} then process it
     */
    @Override
    public void onReceive(Object msg) throws RestClientException {

        if (!(msg instanceof CrawlUriMessage)) {
            unhandled(msg);
            return;
        }

        CrawlUriMessage uriMsg = (CrawlUriMessage) msg;
        if (!uriMsg.getStatus().equals(CrawlUriMessage.STATUS.PROCESS)
                && !uriMsg.getStatus().equals(CrawlUriMessage.STATUS.SAVE)) {
            unhandled(msg);
            return;
        }

        // URI message to process received
        // start the crawling request
        Dataset ds = null;

        // check if resource is already downloaded
        if (uriMsg instanceof ResourceCrawlUriMessage) {
            ResourceCrawlUriMessage resMsg = ((ResourceCrawlUriMessage) uriMsg);
            if (resMsg.getSerializedResource() != null && resMsg.getSerializationFormat() != null) {
                try {
                    // TODO: this should be optimized, why deserialize the resource here when we just want to
                    // save it in the RDF store? How to insert this serialized resource into the SPARQL endpoint?
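                    // deserialize the resource that was delivered with the message so that it can be written
                    // to the triple store via the same dataset-based update path as freshly crawled resources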
                    ds = SparqlService.deserializeDataset(resMsg.getSerializedResource(),
                            resMsg.getSerializationFormat());
                } catch (Exception e) {
                    throw new CrawlWrapperException(e, uriMsg);
                }
            }
        }

        // download resource if not already downloaded
        if (ds == null) {
            try {
                ds = linkedDataSource.getDataForResource(URI.create(uriMsg.getUri()));
            } catch (RestClientException e) {
                throw new CrawlWrapperException(e, uriMsg);
            }
        }

        // Save dataset to triple store
        sparqlService.updateNamedGraphsOfDataset(ds);
        String wonNodeUri = extractWonNodeUri(ds, uriMsg.getUri());
        if (wonNodeUri == null) {
            wonNodeUri = uriMsg.getWonNodeUri();
        }

        // do nothing more here if the STATUS of the message was SAVE
        if (uriMsg.getStatus().equals(CrawlUriMessage.STATUS.SAVE)) {
            log.debug("processed crawl uri event {} with status 'SAVE'", uriMsg);
            return;
        }

        // send extracted non-base URIs back to sender and save meta data about crawling the URI
        // extract only URIs which were crawled at least one recrawl interval ago
        long crawlDate = System.currentTimeMillis();
        log.debug("Extract non-base URIs from message {}", uriMsg);
        Set<String> extractedURIs = sparqlService.extractURIs(uriMsg.getUri(), uriMsg.getBaseUri(),
                config.getCrawlNonBasePropertyPaths(),
                crawlDate - config.getRecrawlIntervalDuration().toMillis());
        for (String extractedURI : extractedURIs) {
            CrawlUriMessage newUriMsg = new CrawlUriMessage(extractedURI, uriMsg.getBaseUri(), wonNodeUri,
                    CrawlUriMessage.STATUS.PROCESS, crawlDate);
            getSender().tell(newUriMsg, getSelf());
        }

        // send extracted base URIs back to sender and save meta data about crawling the URI
        // extract only URIs which were crawled at least one recrawl interval ago
        log.debug("Extract base URIs from message {}", uriMsg);
        extractedURIs = sparqlService.extractURIs(uriMsg.getUri(), uriMsg.getBaseUri(),
                config.getCrawlBasePropertyPaths(),
                crawlDate - config.getRecrawlIntervalDuration().toMillis());
        for (String extractedURI : extractedURIs) {
            CrawlUriMessage newUriMsg = new CrawlUriMessage(extractedURI, extractedURI, wonNodeUri,
                    CrawlUriMessage.STATUS.PROCESS, crawlDate);
            getSender().tell(newUriMsg, getSelf());
        }
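
        // note that extracted base URIs are sent as their own base URI in the new message, whereas the
        // non-base URIs extracted above keep the base URI of the message currently being processed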

        // signal sender that this URI is processed and save meta data about crawling the URI.
        // This needs to be done after all extracted URI messages have been sent to guarantee consistency
        // in case of failure
        crawlDate = System.currentTimeMillis();
        CrawlUriMessage uriDoneMsg = new CrawlUriMessage(uriMsg.getUri(), uriMsg.getBaseUri(), wonNodeUri,
                CrawlUriMessage.STATUS.DONE, crawlDate);
        log.info("Crawling done for URI {}", uriDoneMsg.getUri());
        getSender().tell(uriDoneMsg, getSelf());

        // if this URI/dataset was a need then send an event to the distributed event bus
        NeedModelWrapper needModelWrapper;
        try {
            // only send active needs right now
            needModelWrapper = new NeedModelWrapper(ds);
            NeedState state = needModelWrapper.getNeedState();
            if (state.equals(NeedState.ACTIVE)) {
                log.debug("Created need event for need uri {}", uriMsg.getUri());
                NeedEvent.TYPE type = NeedEvent.TYPE.CREATED;
                NeedEvent needEvent = new NeedEvent(uriMsg.getUri(), wonNodeUri, type, crawlDate, ds);
                pubSubMediator.tell(
                        new DistributedPubSubMediator.Publish(needEvent.getClass().getName(), needEvent),
                        getSelf());
            }
        } catch (DataIntegrityException e) {
            log.debug("no valid need model found in dataset for uri {}", uriMsg.getUri());
        }
    }

    /**
     * Extract the won node URI from a won resource.
     *
     * @param ds  resource as dataset
     * @param uri URI that represents the resource
     * @return won node URI, or null if the resource does not link to a won node
     */
    private String extractWonNodeUri(Dataset ds, String uri) {
        try {
            return RdfUtils.findOnePropertyFromResource(ds, URI.create(uri), WON.HAS_WON_NODE).asResource().getURI();
        } catch (IncorrectPropertyCountException e) {
            return null;
        }
    }

    public void setSparqlService(final CrawlSparqlService sparqlService) {
        this.sparqlService = sparqlService;
    }
}