package org.myrobotlab.service;
import java.util.List;
import org.myrobotlab.document.Document;
import org.myrobotlab.document.ProcessingStatus;
import org.myrobotlab.document.connector.ConnectorState;
import org.myrobotlab.document.transformer.StageConfiguration;
import org.myrobotlab.document.transformer.WorkflowConfiguration;
import org.myrobotlab.document.workflow.WorkflowMessage;
import org.myrobotlab.document.workflow.WorkflowServer;
import org.myrobotlab.framework.Service;
import org.myrobotlab.framework.ServiceType;
import org.myrobotlab.service.interfaces.DocumentListener;
import org.myrobotlab.service.interfaces.DocumentPublisher;
/**
 * A service that receives documents (as a {@link DocumentListener}) and hands
 * each one to a {@link WorkflowServer} workflow for processing. Only one
 * workflow per service instance is supported; its name is taken from the
 * {@link WorkflowConfiguration} supplied via {@link #setConfig} when
 * {@link #initalize()} is called.
 *
 * Typical usage: {@code setConfig(...)}, then {@code initalize()}, then attach
 * the pipeline as a listener to a connector, and finally {@code flush()} once
 * the connector is done.
 */
public class DocumentPipeline extends Service implements DocumentListener, DocumentPublisher {

  private static final long serialVersionUID = 1L;

  /** How long to sleep between checks while waiting for the inbox to drain. */
  private static final long INBOX_DRAIN_POLL_MS = 500;

  /** Workflow definition loaded into the workflow server by initalize(). */
  private WorkflowConfiguration config;

  /** Set by initalize(); null until then. */
  private transient WorkflowServer workflowServer;

  /** Name of the workflow that messages are routed to. */
  private String workflowName = "default";

  public DocumentPipeline(String reservedKey) {
    super(reservedKey);
  }

  /**
   * @return the categories this service is listed under
   */
  static public String[] getCategories() {
    return new String[] { "data" };
  }

  /**
   * Sets the workflow configuration to be loaded by {@link #initalize()}.
   *
   * @param workflowConfig
   *          the workflow definition
   */
  public void setConfig(WorkflowConfiguration workflowConfig) {
    this.config = workflowConfig;
  }

  /**
   * Publishing point for documents; returns the document unchanged.
   *
   * @param doc
   *          the document to publish
   * @return the same document
   */
  @Override
  public Document publishDocument(Document doc) {
    // publish the document to the framework
    return doc;
  }

  /**
   * Currently a no-op.
   *
   * @param listener
   *          the listener that wants to receive published documents
   */
  @Override
  public void addDocumentListener(DocumentListener listener) {
    // TODO: wire the listener up to publishDocument, e.g.
    // subscribe("publishDocument", topicMethod, callbackName, callbackMethod);
  }

  /**
   * Wraps the document in a {@link WorkflowMessage} addressed to this
   * pipeline's workflow and passes it to the workflow server.
   *
   * @param doc
   *          the document to process
   * @return {@link ProcessingStatus#ERROR} if the calling thread was
   *         interrupted while handing the message off, otherwise
   *         {@link ProcessingStatus#OK}
   * @throws IllegalStateException
   *           if {@link #initalize()} has not been called
   */
  @Override
  public ProcessingStatus onDocument(Document doc) {
    WorkflowServer server = requireWorkflowServer();
    WorkflowMessage msg = new WorkflowMessage();
    msg.setDoc(doc);
    // for now only default workflow supported (1 workflow per DocumentPipeline
    // service i guess?)
    msg.setWorkflow(workflowName);
    // TODO: the type message should be add/update/delete sort of message types.
    // msg.setType(type);
    try {
      server.processMessage(msg);
    } catch (InterruptedException e) {
      // Restore the interrupt status so code further up the stack can see it.
      Thread.currentThread().interrupt();
      log.warn("Interrupted while submitting document to workflow", e);
      // TODO: this isn't correct!
      return ProcessingStatus.ERROR;
    }
    // TODO: we need to properly track the status of the message we just sent
    // off. Callbacks should be re-designed here; this processing status only
    // says the message was handed off.
    return ProcessingStatus.OK;
  }

  /**
   * Waits for this service's inbox to become empty and then flushes the
   * workflow. If the calling thread is interrupted while waiting, the wait is
   * abandoned, the interrupt status is restored and the workflow is flushed
   * with whatever has been consumed so far.
   *
   * @throws IllegalStateException
   *           if {@link #initalize()} has not been called
   */
  public void flush() {
    // Fail before waiting rather than after the inbox has drained.
    WorkflowServer server = requireWorkflowServer();
    // TODO: This seems dangerous if we want to flush while continuously
    // feeding; we'll never get to flush unless data pauses while we catch up.
    while (getInbox().size() > 0) {
      try {
        log.info("Waiting for inbox to drain...Size: {}", getInbox().size());
        Thread.sleep(INBOX_DRAIN_POLL_MS);
      } catch (InterruptedException e) {
        // Stop waiting instead of looping forever with the interrupt swallowed.
        Thread.currentThread().interrupt();
        log.warn("Interrupted while waiting for inbox to drain", e);
        break;
      }
    }
    // TODO: what if our inbox isn't empty?
    server.flush(workflowName);
  }

  /**
   * Example: crawl an RSS feed and push every document through a two-stage
   * workflow (set a static field, then send to Solr).
   */
  public static void main(String[] args) throws Exception {
    // create the pipeline service in MRL
    DocumentPipeline pipeline = (DocumentPipeline) Runtime.start("docproc", "DocumentPipeline");
    // create a workflow to load into that pipeline service
    WorkflowConfiguration workflowConfig = new WorkflowConfiguration("default");
    workflowConfig.setName("default");

    StageConfiguration stage1Config = new StageConfiguration();
    stage1Config.setStageClass("org.myrobotlab.document.transformer.SetStaticFieldValue");
    stage1Config.setStageName("SetTableField");
    stage1Config.setStringParam("table", "MRL");
    workflowConfig.addStage(stage1Config);

    StageConfiguration stage2Config = new StageConfiguration();
    stage2Config.setStageClass("org.myrobotlab.document.transformer.SendToSolr");
    stage2Config.setStageName("SendToSolr");
    stage2Config.setStringParam("solrUrl", "http://phobos:8983/solr/graph");
    workflowConfig.addStage(stage2Config);

    pipeline.setConfig(workflowConfig);
    pipeline.initalize();

    RSSConnector connector = (RSSConnector) Runtime.start("rss", "RSSConnector");
    connector.addDocumentListener(pipeline);
    connector.startCrawling();
    // TODO: make sure we flush the pending batches!
    // connector.flush();
    // poll to make sure the connector is still running.
    while (ConnectorState.RUNNING.equals(connector.getState())) {
      System.out.println(".");
      Thread.sleep(1000);
    }
    // when the connector is done, tell the pipeline to flush.
    pipeline.flush();
  }

  /**
   * Obtains the workflow server (if not already held), registers the
   * configured workflow with it, adopts the workflow's name and switches the
   * inbox to blocking mode.
   *
   * @throws ClassNotFoundException
   *           declared by the original signature; presumably raised when a
   *           stage class cannot be loaded - TODO confirm
   * @throws IllegalStateException
   *           if no configuration was supplied via {@link #setConfig}
   */
  public void initalize() throws ClassNotFoundException {
    if (config == null) {
      throw new IllegalStateException("No workflow configuration set; call setConfig() before initalize()");
    }
    // init the workflow server and load the pipeline config.
    if (workflowServer == null) {
      workflowServer = WorkflowServer.getInstance();
    }
    workflowServer.addWorkflow(config);
    workflowName = config.getName();
    // We can't drop messages! apply back pressure if the inbox is full!
    this.inbox.setBlocking(true);
  }

  /**
   * Processes each document in order via {@link #onDocument}.
   *
   * @param docs
   *          the documents to process
   * @return {@link ProcessingStatus#ERROR} if any document reported an error,
   *         otherwise {@link ProcessingStatus#OK}
   */
  // TODO: put this on a base class or something?
  public ProcessingStatus onDocuments(List<Document> docs) {
    ProcessingStatus totalStat = ProcessingStatus.OK;
    for (Document d : docs) {
      ProcessingStatus stat = onDocument(d);
      if (ProcessingStatus.ERROR.equals(stat)) {
        totalStat = ProcessingStatus.ERROR;
      }
    }
    return totalStat;
  }

  /**
   * Passes a flush request for this pipeline's workflow to the workflow
   * server.
   *
   * @return always true
   * @throws IllegalStateException
   *           if {@link #initalize()} has not been called
   */
  @Override
  public boolean onFlush() {
    requireWorkflowServer().flush(workflowName);
    return true;
  }

  /**
   * This static method returns all the details of the class without it having
   * to be constructed. It has description, categories, dependencies, and peer
   * definitions.
   *
   * @return ServiceType - returns all the data
   *
   */
  static public ServiceType getMetaData() {
    ServiceType meta = new ServiceType(DocumentPipeline.class.getCanonicalName());
    meta.addDescription("This service will pass a document through a document processing pipeline made up of transformers");
    meta.addCategory("ingest");
    return meta;
  }

  /**
   * Returns the workflow server, failing with a descriptive error instead of
   * a bare NullPointerException when the pipeline has not been initialized.
   */
  private WorkflowServer requireWorkflowServer() {
    if (workflowServer == null) {
      throw new IllegalStateException("DocumentPipeline is not initialized; call initalize() first");
    }
    return workflowServer;
  }
}