package org.myrobotlab.document.transformer;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.myrobotlab.document.Document;
/**
* This stage applies a jsoup selector string to HTML and stores the resulting
* data in the output field.
*
* @author kwatters
*
*/
public class JSoupExtractor extends AbstractStage {

  /** Name of the document field whose values are parsed as HTML. */
  private String htmlField = "html";

  /** Name of the document field that receives the selected elements. */
  private String outputField = "links";

  /** jsoup selector expression evaluated against each parsed HTML value. */
  private String jSoupSelector = "a[href]";

  /**
   * Reads the stage settings from the supplied configuration. A {@code null}
   * configuration leaves the current field values untouched.
   *
   * <p>
   * Recognized property keys: {@code htmlField} (default {@code "html"}),
   * {@code outputField} (default {@code "links"}) and {@code jSoupSelector}
   * (default {@code "a[href]"}).
   *
   * @param config
   *          the stage configuration, may be {@code null}
   */
  @Override
  public void startStage(StageConfiguration config) {
    if (config == null) {
      return;
    }
    // Earlier versions looked this setting up under the current VALUE of
    // htmlField (the key "html" by default) rather than the literal key
    // "htmlField". The proper key now wins; the old lookup is kept as the
    // fallback so existing configurations keep working.
    String legacyHtmlField = config.getProperty(htmlField, "html");
    htmlField = config.getProperty("htmlField", legacyHtmlField);
    outputField = config.getProperty("outputField", "links");
    jSoupSelector = config.getProperty("jSoupSelector", "a[href]");
  }

  /**
   * Parses every value of the HTML field with jsoup, evaluates the configured
   * selector and appends each matching element to the output field of the same
   * document.
   *
   * @param doc
   *          the document to process
   * @return always {@code null}; presumably this signals "no child documents"
   *         to the pipeline - confirm against AbstractStage
   */
  @Override
  public List<Document> processDocument(Document doc) {
    // NOTE(review): getField presumably returns null when the document has no
    // such field - confirm against Document. Guarding here avoids a
    // NullPointerException in the for-each below.
    if (doc.getField(htmlField) == null) {
      return null;
    }
    for (Object htmlValue : doc.getField(htmlField)) {
      if (htmlValue == null) {
        // Nothing to parse for this value.
        continue;
      }
      org.jsoup.nodes.Document jSoupDoc = Jsoup.parse(htmlValue.toString());
      Elements matches = jSoupDoc.select(jSoupSelector);
      for (Element match : matches) {
        // The jsoup Element itself is stored, not its text or an attribute.
        doc.addToField(outputField, match);
      }
    }
    return null;
  }

  /** No resources to release. */
  @Override
  public void stopStage() {
    // intentionally empty
  }

  /** Nothing is buffered by this stage, so there is nothing to flush. */
  @Override
  public void flush() {
    // intentionally empty
  }

  /**
   * @return the name of the field that is read as HTML
   */
  public String getHtmlField() {
    return htmlField;
  }

  /**
   * @param htmlField
   *          the name of the field that is read as HTML
   */
  public void setHtmlField(String htmlField) {
    this.htmlField = htmlField;
  }

  /**
   * @return the name of the field that receives the selected elements
   */
  public String getOutputField() {
    return outputField;
  }

  /**
   * @param outputField
   *          the name of the field that receives the selected elements
   */
  public void setOutputField(String outputField) {
    this.outputField = outputField;
  }

  /**
   * @return the jsoup selector expression
   */
  public String getjSoupSelector() {
    return jSoupSelector;
  }

  /**
   * @param jSoupSelector
   *          the jsoup selector expression
   */
  public void setjSoupSelector(String jSoupSelector) {
    this.jSoupSelector = jSoupSelector;
  }
}