package won.matcher.service.crawler.actor;
import akka.actor.ActorRef;
import akka.actor.UntypedActor;
import akka.cluster.pubsub.DistributedPubSub;
import akka.cluster.pubsub.DistributedPubSubMediator;
import akka.event.Logging;
import akka.event.LoggingAdapter;
import org.apache.jena.query.Dataset;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Component;
import org.springframework.web.client.RestClientException;
import won.matcher.service.common.event.NeedEvent;
import won.matcher.service.common.service.sparql.SparqlService;
import won.matcher.service.crawler.config.CrawlConfig;
import won.matcher.service.crawler.exception.CrawlWrapperException;
import won.matcher.service.crawler.msg.CrawlUriMessage;
import won.matcher.service.crawler.msg.ResourceCrawlUriMessage;
import won.matcher.service.crawler.service.CrawlSparqlService;
import won.protocol.exception.DataIntegrityException;
import won.protocol.exception.IncorrectPropertyCountException;
import won.protocol.model.NeedState;
import won.protocol.util.NeedModelWrapper;
import won.protocol.util.RdfUtils;
import won.protocol.util.linkeddata.LinkedDataSource;
import won.protocol.vocabulary.WON;
import java.net.URI;
import java.util.Set;
/**
* Actor that requests a linked data URI via HTTP and saves the retrieved data to a triple store
* using a SPARQL UPDATE query. Responds to the sender with the URIs extracted from the linked data resource.
*
* This class uses property paths to extract URIs from linked data resources. These property paths are evaluated
* relative to base URIs, so there are two types of property paths: base property paths extract URIs that are
* taken as new base URIs, while non-base property paths extract URIs that keep the current base URI.
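*
* For example (illustrative only; the actual paths come from the crawler configuration): a base property
* path might follow the link from a need to its connections container, making each connection URI a new
* base URI, while a non-base property path might extract member URIs that are still crawled relative to
* the current base URI.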
*
* User: hfriedrich
* Date: 07.04.2015
*/
@Component
@Scope("prototype")
public class WorkerCrawlerActor extends UntypedActor
{
private LoggingAdapter log = Logging.getLogger(getContext().system(), this);
@Autowired
private LinkedDataSource linkedDataSource;
@Autowired
private CrawlSparqlService sparqlService;
@Autowired
private CrawlConfig config;
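// mediator of the Akka distributed pub/sub extension, used to publish need events to the matchers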
private ActorRef pubSubMediator;
@Override
public void preStart() {
// initialize the distributed event bus to send need events to the matchers
pubSubMediator = DistributedPubSub.get(getContext().system()).mediator();
}
/**
* Receives messages containing a URI and processes them by requesting the resource,
* saving it to a triple store, extracting URIs from its content and answering the sender.
*
* @param msg message to process; handled only if it is of type {@link CrawlUriMessage}
*/
@Override
public void onReceive(Object msg) throws Exception {
if (!(msg instanceof CrawlUriMessage)) {
unhandled(msg);
return;
}
CrawlUriMessage uriMsg = (CrawlUriMessage) msg;
if (!uriMsg.getStatus().equals(CrawlUriMessage.STATUS.PROCESS) &&
!uriMsg.getStatus().equals(CrawlUriMessage.STATUS.SAVE)) {
unhandled(msg);
return;
}
// a URI message to process was received, start the crawling request
Dataset ds = null;
// check if the resource was already downloaded and passed along inside the message
if (uriMsg instanceof ResourceCrawlUriMessage) {
ResourceCrawlUriMessage resMsg = ((ResourceCrawlUriMessage) uriMsg);
if (resMsg.getSerializedResource() != null && resMsg.getSerializationFormat() != null) {
try {
// TODO: this should be optimized: why deserialize the resource here when we only want to save it in the RDF
// store? Could the serialized resource be inserted into the SPARQL endpoint directly?
ds = SparqlService.deserializeDataset(resMsg.getSerializedResource(), resMsg.getSerializationFormat());
} catch (Exception e) {
throw new CrawlWrapperException(e, uriMsg);
}
}
}
// download resource if not already downloaded
if (ds == null) {
try {
ds = linkedDataSource.getDataForResource(URI.create(uriMsg.getUri()));
} catch (RestClientException e) {
throw new CrawlWrapperException(e, uriMsg);
}
}
// Save dataset to triple store
sparqlService.updateNamedGraphsOfDataset(ds);
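// determine the won node URI: prefer the won node linked in the crawled dataset,
// otherwise fall back to the won node URI carried by the message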
String wonNodeUri = extractWonNodeUri(ds, uriMsg.getUri());
if (wonNodeUri == null) {
wonNodeUri = uriMsg.getWonNodeUri();
}
// do nothing more here if the STATUS of the message was SAVE
if (uriMsg.getStatus().equals(CrawlUriMessage.STATUS.SAVE)) {
log.debug("processed crawl uri event {} with status 'SAVE'", uriMsg);
return;
}
// send extracted non-base URIs back to the sender and save metadata about crawling the URI;
// extract only URIs that were last crawled at least one recrawl interval ago
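// a single crawl date serves both as the reference point for the recrawl cutoff
// and as the crawl date of the newly created URI messages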
long crawlDate = System.currentTimeMillis();
log.debug("Extract non-base URIs from message {}", uriMsg);
Set<String> extractedURIs = sparqlService.extractURIs(
uriMsg.getUri(), uriMsg.getBaseUri(), config.getCrawlNonBasePropertyPaths(),
crawlDate - config.getRecrawlIntervalDuration().toMillis());
for (String extractedURI : extractedURIs) {
CrawlUriMessage newUriMsg = new CrawlUriMessage(
extractedURI, uriMsg.getBaseUri(), wonNodeUri, CrawlUriMessage.STATUS.PROCESS, crawlDate);
getSender().tell(newUriMsg, getSelf());
}
// send extracted base URIs back to the sender and save metadata about crawling the URI;
// extract only URIs that were last crawled at least one recrawl interval ago
log.debug("Extract base URIs from message {}", uriMsg);
extractedURIs = sparqlService.extractURIs(uriMsg.getUri(), uriMsg.getBaseUri(), config.getCrawlBasePropertyPaths(),
crawlDate - config.getRecrawlIntervalDuration().toMillis());
for (String extractedURI : extractedURIs) {
CrawlUriMessage newUriMsg = new CrawlUriMessage(
extractedURI, extractedURI, wonNodeUri, CrawlUriMessage.STATUS.PROCESS, crawlDate);
getSender().tell(newUriMsg, getSelf());
}
// signal the sender that this URI is processed and save metadata about crawling the URI.
// This needs to be done after all extracted URI messages have been sent, to guarantee consistency
// in case of failure
crawlDate = System.currentTimeMillis();
CrawlUriMessage uriDoneMsg = new CrawlUriMessage(
uriMsg.getUri(), uriMsg.getBaseUri(), wonNodeUri, CrawlUriMessage.STATUS.DONE, crawlDate);
log.info("Crawling done for URI {}", uriDoneMsg.getUri());
getSender().tell(uriDoneMsg, getSelf());
// if this URI/dataset represents a need, send an event to the distributed event bus
try {
// only send events for active needs right now
NeedModelWrapper needModelWrapper = new NeedModelWrapper(ds);
NeedState state = needModelWrapper.getNeedState();
if (state.equals(NeedState.ACTIVE)) {
log.debug("Created need event for need uri {}", uriMsg.getUri());
NeedEvent.TYPE type = NeedEvent.TYPE.CREATED;
NeedEvent needEvent = new NeedEvent(uriMsg.getUri(), wonNodeUri, type, crawlDate, ds);
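// publish the event on a pub/sub topic named after the event class; the matchers subscribe to that topic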
pubSubMediator
.tell(new DistributedPubSubMediator.Publish(needEvent.getClass().getName(), needEvent), getSelf());
}
} catch (DataIntegrityException e) {
log.debug("no valid need model found in dataset for uri {}", uriMsg.getUri());
}
}
/**
* Extracts the won node URI from a won resource.
*
* @param ds resource as dataset
* @param uri URI that identifies the resource
* @return won node URI, or null if the resource does not link to a won node
*/
private String extractWonNodeUri(Dataset ds, String uri) {
try {
return RdfUtils.findOnePropertyFromResource(ds, URI.create(uri), WON.HAS_WON_NODE).asResource().getURI();
} catch (IncorrectPropertyCountException e) {
return null;
}
}
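/**
* Setter for the SPARQL service; presumably used to inject a mock or alternative service in tests,
* since the field is otherwise autowired by Spring.
*/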
public void setSparqlService(final CrawlSparqlService sparqlService) {
this.sparqlService = sparqlService;
}
}