package org.myrobotlab.document.transformer;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.myrobotlab.document.Document;
/**
 * This stage uses a regex to match the values of a string field and stores the
 * text captured by selected groups into the output field.
 *
 * The list of keepGroups tells the RegexExtractor which groups from the regular
 * expression to keep. Groups are concatenated, in the order listed, to form the
 * output value. When no keepGroups are configured, group 1 is kept.
 *
 * The pattern is compiled with CASE_INSENSITIVE, DOTALL and MULTILINE, and it is
 * applied with {@link Matcher#matches()}, so it must match the entire field
 * value (e.g. {@code .*(foo).*} rather than {@code (foo)}).
 *
 * @author kwatters, dmeehl
 *
 */
public class RegexExtractor extends AbstractStage {

  /** Flags used when compiling the configured regular expression. */
  private static final int PATTERN_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL | Pattern.MULTILINE;

  /** Name of the field whose values are matched against the pattern. */
  private String inputField = null;
  /** Name of the field that receives the extracted text. */
  private String outputField = null;
  /** Indexes of the capture groups to concatenate into each output value. */
  private List<Integer> keepGroups = null;
  /** Source text of the regular expression. */
  private String regex = null;
  /** Compiled form of {@link #regex}; built in {@link #startStage}. */
  private Pattern pattern;

  /**
   * Reads the stage settings and compiles the regular expression.
   *
   * When a configuration is supplied, the properties "inputField" (default
   * "text"), "outputField" (default "entity"), "keepGroups", "regex" and
   * "processOnlyNull" are read from it. When it is null, the values previously
   * supplied through the setters are used instead.
   *
   * @param config
   *          the stage configuration, may be null
   * @throws NullPointerException
   *           if no regex has been provided by the configuration or the setter
   * @throws NumberFormatException
   *           if a keepGroups entry is not an integer
   */
  @Override
  public void startStage(StageConfiguration config) {
    if (config != null) {
      inputField = config.getProperty("inputField", "text");
      outputField = config.getProperty("outputField", "entity");
      List<String> keepGroupsStr = config.getListParam("keepGroups");
      regex = config.getProperty("regex");
      processOnlyNull = config.getBoolParam("processOnlyNull", processOnlyNull);
      keepGroups = null;
      if (keepGroupsStr != null) {
        keepGroups = new ArrayList<Integer>();
        for (String groupNum : keepGroupsStr) {
          keepGroups.add(Integer.parseInt(groupNum));
        }
      }
    }
    if (keepGroups == null) {
      // Default to the first capture group. This also covers a null config,
      // which previously left keepGroups null and broke processDocument.
      keepGroups = new ArrayList<Integer>();
      keepGroups.add(1);
    }
    // Fail with a descriptive message instead of a bare NPE from Pattern.compile.
    Objects.requireNonNull(regex, "RegexExtractor requires a 'regex' property");
    pattern = Pattern.compile(regex, PATTERN_FLAGS);
  }

  /**
   * Matches every value of the input field against the pattern and writes the
   * concatenated kept groups of each matching value to the output field.
   *
   * Documents without the input field are left untouched, as are documents that
   * already have the output field when processOnlyNull is set. Otherwise the
   * output field is removed and then re-populated with one entry per matching
   * input value (so it ends up absent when nothing matched).
   *
   * @param doc
   *          the document to process
   * @return always null; this stage doesn't emit child docs
   * @throws IndexOutOfBoundsException
   *           if a kept group index does not exist in the pattern
   */
  @Override
  public List<Document> processDocument(Document doc) {
    if (!doc.hasField(inputField)) {
      return null;
    }
    if (processOnlyNull && doc.hasField(outputField)) {
      return null;
    }
    List<String> matches = new ArrayList<String>();
    for (Object o : doc.getField(inputField)) {
      if (o == null) {
        // Nothing to match against; skip rather than fail on toString().
        continue;
      }
      Matcher matcher = pattern.matcher(o.toString());
      // Only patterns that declare at least one capture group produce output.
      if (matcher.matches() && matcher.groupCount() > 0) {
        StringBuilder match = new StringBuilder();
        for (Integer num : keepGroups) {
          String group = matcher.group(num);
          // group() returns null for a group that did not participate in the
          // match (e.g. an optional group); don't emit the literal "null".
          if (group != null) {
            match.append(group);
          }
        }
        matches.add(match.toString());
      }
    }
    doc.removeField(outputField);
    for (String match : matches) {
      doc.addToField(outputField, match);
    }
    // this stage doesn't emit child docs.
    return null;
  }

  /** No resources to release. */
  @Override
  public void stopStage() {
  }

  /** Nothing is buffered, so there is nothing to flush. */
  @Override
  public void flush() {
  }

  /**
   * @return the name of the field whose values are matched
   */
  public String getInputField() {
    return inputField;
  }

  /**
   * @param inputField
   *          the name of the field whose values are matched
   */
  public void setInputField(String inputField) {
    this.inputField = inputField;
  }

  /**
   * @return the name of the field that receives the extracted text
   */
  public String getOutputField() {
    return outputField;
  }

  /**
   * @param outputField
   *          the name of the field that receives the extracted text
   */
  public void setOutputField(String outputField) {
    this.outputField = outputField;
  }

  /**
   * @return the source text of the regular expression
   */
  public String getRegex() {
    return regex;
  }

  /**
   * Sets the regular expression text. The compiled pattern is only rebuilt by
   * {@link #startStage}, so call this before the stage is started.
   *
   * @param regex
   *          the source text of the regular expression
   */
  public void setRegex(String regex) {
    this.regex = regex;
  }
}