package org.myrobotlab.document.transformer;

import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;

import org.apache.commons.lang.StringUtils;
import org.myrobotlab.document.Document;
import org.myrobotlab.logging.LoggerFactory;
import org.slf4j.Logger;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;

/**
 * Pipeline stage that runs OpenNLP over a text field of a document.
 * <p>
 * For every sentence found in the configured text field the stage:
 * <ul>
 * <li>adds the part-of-speech tagged sentence to the {@code posTextField},</li>
 * <li>adds every detected person name to the {@code peopleField},</li>
 * <li>emits "triple" child documents (subject / verb / object) derived from
 * the tagged sentence.</li>
 * </ul>
 * After all text has been processed, one child document per value of the
 * people field is emitted as well.
 */
public class NounPhraseExtractor extends AbstractStage {

  public final static Logger log = LoggerFactory.getLogger(NounPhraseExtractor.class.getCanonicalName());

  private String personModelFile = "./opennlp/en-ner-person.bin";
  private String sentenceModelFile = "./opennlp/en-sent.bin";
  private String tokenModelFile = "./opennlp/en-token.bin";
  private String posModelFile = "./opennlp/en-pos-maxent.bin";

  // TODO: These are NOT thread safe! WorkflowServer must be single threaded
  // until we make these thread safe.
  private SentenceDetectorME sentenceDetector;
  private Tokenizer tokenizer;
  private NameFinderME nameFinder;
  private POSTaggerME posTagger;

  private String textField = "text";
  private String peopleField = "people";
  private String posTextField = "pos_text";
  private String sep = " ";

  /**
   * Reads the field names from the stage configuration (falling back to the
   * current values) and loads the four OpenNLP models from disk.
   * <p>
   * A failure to load a model is logged and leaves the remaining model fields
   * unset; it is not rethrown.
   *
   * @param config
   *          stage configuration supplying "textField", "peopleField" and
   *          "posTextField"
   */
  @Override
  public void startStage(StageConfiguration config) {
    // parse the config to map the params properly
    textField = config.getProperty("textField", textField);
    peopleField = config.getProperty("peopleField", peopleField);
    posTextField = config.getProperty("posTextField", posTextField);

    FileInputStream sentenceIn = null;
    FileInputStream tokenIn = null;
    FileInputStream personIn = null;
    FileInputStream posIn = null;
    try {
      // Sentence finder
      sentenceIn = new FileInputStream(sentenceModelFile);
      sentenceDetector = new SentenceDetectorME(new SentenceModel(sentenceIn));
      // tokenizer
      tokenIn = new FileInputStream(tokenModelFile);
      tokenizer = new TokenizerME(new TokenizerModel(tokenIn));
      // person name finder
      personIn = new FileInputStream(personModelFile);
      nameFinder = new NameFinderME(new TokenNameFinderModel(personIn));
      // load the part of speech tagger.
      posIn = new FileInputStream(posModelFile);
      posTagger = new POSTaggerME(new POSModel(posIn));
    } catch (IOException e) {
      log.error("Error loading up OpenNLP Models.", e);
    } finally {
      // the model constructors do not take ownership of the streams
      closeQuietly(sentenceIn);
      closeQuietly(tokenIn);
      closeQuietly(personIn);
      closeQuietly(posIn);
    }
  }

  /** Closes a model stream, logging (not propagating) any failure. */
  private static void closeQuietly(FileInputStream in) {
    if (in == null) {
      return;
    }
    try {
      in.close();
    } catch (IOException e) {
      log.warn("Failed to close model stream.", e);
    }
  }

  /** True when every OpenNLP component needed by processDocument is set. */
  private boolean modelsLoaded() {
    return sentenceDetector != null && tokenizer != null && nameFinder != null && posTagger != null;
  }

  /**
   * Runs sentence detection, tokenization, person-name finding and POS tagging
   * over every String value of the text field, enriching {@code doc} and
   * collecting child documents.
   *
   * @param doc
   *          the document to process; it is modified in place
   * @return the generated child documents, or {@code null} when the document
   *         has no text field or the models are not loaded
   */
  @Override
  public List<Document> processDocument(Document doc) {
    log.info("Processing Doc: {}", doc.getId());
    List<Document> children = new ArrayList<Document>();
    if (!doc.hasField(textField)) {
      log.info("No Text Field On Document {}", doc.getId());
      return null;
    }
    if (!modelsLoaded()) {
      // startStage failed (or stopStage already ran); avoid a NullPointerException
      log.error("OpenNLP models are not loaded, skipping document {}", doc.getId());
      return null;
    }

    for (Object o : doc.getField(textField)) {
      if (o == null) {
        log.info("Null field value! Field : {} Doc: {}", textField, doc.getId());
        continue;
      }
      if (!(o instanceof String)) {
        log.info("Only Strings will be processed not {}", o.getClass());
        continue;
      }
      String text = o.toString();
      if (StringUtils.isEmpty(text)) {
        // skip empty/null strings
        continue;
      }
      for (String sentence : sentenceDetector.sentDetect(text)) {
        if (StringUtils.isEmpty(sentence)) {
          log.info("Empty sentence...");
          continue;
        }
        String[] tokens = tokenizer.tokenize(sentence);
        Span[] spans = nameFinder.find(tokens);
        // part of speech tagging
        String posText = posTagger.tag(sentence);
        // extract a triple from the sentence.
        children.addAll(createTripleDocuments(doc.getId(), posText));
        doc.addToField(posTextField, posText);
        for (Span span : spans) {
          String[] terms = Arrays.copyOfRange(tokens, span.getStart(), span.getEnd());
          doc.addToField(peopleField, StringUtils.join(terms, sep));
        }
      }
    }
    // the name finder accumulates per-document state; reset it so that this
    // document does not influence the next one.
    nameFinder.clearAdaptiveData();

    // TODO: move this into its own stage. but for now, this is just to poc it.
    children.addAll(createEntityMentionDocs(doc));
    log.info("Extracted {} children records from that document.", children.size());
    for (Document d : children) {
      log.info(d.toString());
    }
    return children;
  }

  /**
   * Builds subject/verb/object child documents from a POS tagged sentence.
   * <p>
   * The input is a space separated list of {@code word/TAG} tokens. Runs of
   * tags starting with "N" before the first verb become subjects, runs after
   * it become objects, and runs of tags starting with "V" (plus "JJ" tokens)
   * become verbs. Every subject/verb/object combination in which subject and
   * object differ yields one child document.
   *
   * @param parentId
   *          id of the document the sentence came from
   * @param posText
   *          the tagged sentence
   * @return the triple documents, possibly empty
   */
  private List<Document> createTripleDocuments(String parentId, String posText) {
    // TODO : implement a much better tuned grammar for parsing
    // subject/object/verb
    // this is very likely language dependent.
    List<Document> childrenDocs = new ArrayList<Document>();
    // we'll look for the nouns, then the verbs, then the nouns again. (add an
    // end element to the sentence so that a trailing run gets closed)
    String[] parts = (posText + " END/END").split(" ");

    // we want to find the runs of n* and v* ...
    List<String> subjects = new ArrayList<String>();
    List<String> verbs = new ArrayList<String>();
    List<String> objects = new ArrayList<String>();
    StringBuilder currentSubject = new StringBuilder();
    StringBuilder currentVerb = new StringBuilder();
    StringBuilder currentObject = new StringBuilder();
    // state info for the iteration
    String prevPOS = "";
    boolean seenFirstVerb = false;

    for (String rawPart : parts) {
      String part = rawPart.trim();
      // split on the LAST slash: the word itself may contain '/' and a
      // trailing slash must not blow up with an index error.
      int slash = part.lastIndexOf('/');
      if (slash < 0) {
        continue;
      }
      String word = part.substring(0, slash);
      String pos = part.substring(slash + 1);
      boolean isNoun = pos.startsWith("N");
      boolean isVerb = pos.startsWith("V");

      // NN to not NN ends nouns
      // not NN to NN starts nouns.
      if (isNoun) {
        StringBuilder target = seenFirstVerb ? currentObject : currentSubject;
        target.append(word).append(' ');
      }
      if (prevPOS.startsWith("N") && !isNoun) {
        // decided before this token may flip seenFirstVerb below
        if (!seenFirstVerb) {
          flushPhrase(currentSubject, subjects);
        } else {
          flushPhrase(currentObject, objects);
        }
      }
      // now for verb phrases.
      if (isVerb || "JJ".equals(pos)) {
        seenFirstVerb = true;
        currentVerb.append(word).append(' ');
      }
      if (prevPOS.startsWith("V") && !isVerb) {
        flushPhrase(currentVerb, verbs);
      }
      prevPOS = pos;
    }

    // now we want to see what all the verbs/nouns we found are.
    // cartesian expansion.. just for fun!
    for (String subject : subjects) {
      for (String verb : verbs) {
        for (String object : objects) {
          if (subject.equals(object)) {
            continue;
          }
          // lets create a child document for each of these combinations.
          // TODO: something better than this..
          String childId = "triple_" + parentId + "_" + subject + " " + verb + " " + object;
          Document child = new Document(childId);
          child.setField("table", "triple");
          child.setField("subject", subject);
          child.setField("verb", verb);
          child.setField("object", object);
          child.setField("parent_id", parentId);
          // TODO: sanitize this fieldname
          child.setField(normalizeFieldName(verb) + "_verb", object);
          // add it to the list of docs that we've created.
          childrenDocs.add(child);
        }
      }
    }
    return childrenDocs;
  }

  /** Adds the trimmed phrase to sink when non-empty, then clears the buffer. */
  private static void flushPhrase(StringBuilder phrase, List<String> sink) {
    String text = phrase.toString().trim();
    if (!StringUtils.isEmpty(text)) {
      sink.add(text);
    }
    phrase.setLength(0);
  }

  /**
   * Turns a verb phrase into a field name fragment: spaces become underscores
   * and the result is lower-cased independently of the default locale.
   */
  private String normalizeFieldName(String verb) {
    return verb.replace(' ', '_').toLowerCase(Locale.ROOT);
  }

  /**
   * Creates one child document per value of the people field.
   *
   * @param doc
   *          the parent document
   * @return the person documents; empty when the document has no people field
   */
  private List<Document> createEntityMentionDocs(Document doc) {
    List<Document> docs = new ArrayList<Document>();
    // we have the fact that certain people are actually people.
    if (!doc.hasField(peopleField)) {
      log.info("No people found...");
      return docs;
    }
    for (Object o : doc.getField(peopleField)) {
      if (o == null) {
        continue;
      }
      String person = o.toString();
      // the unique id for this, is the doc id and the person
      // TODO: handle person name collisions.
      // TODO: something better, but for now this is good enough.
      Document personDoc = new Document("person_" + doc.getId() + "_" + person);
      personDoc.setField("person", person);
      // TODO: consider some better ideas for how to set these for each person.
      personDoc.setField("parent_id", doc.getId());
      personDoc.setField("node_id", person);
      // maybe copy some other field from the parent doc?
      personDoc.setField("is_verb", "person");
      // previously the document was built but never returned to the caller
      docs.add(personDoc);
    }
    return docs;
  }

  /** Drops the references to the loaded OpenNLP components. */
  @Override
  public void stopStage() {
    sentenceDetector = null;
    tokenizer = null;
    nameFinder = null;
    posTagger = null;
  }

  /** Nothing is buffered by this stage, so there is nothing to flush. */
  @Override
  public void flush() {
    // no op
  }

  public String getPersonModelFile() {
    return personModelFile;
  }

  public void setPersonModelFile(String personModelFile) {
    this.personModelFile = personModelFile;
  }

  public String getSentenceModelFile() {
    return sentenceModelFile;
  }

  public void setSentenceModelFile(String sentenceModelFile) {
    this.sentenceModelFile = sentenceModelFile;
  }

  public String getTokenModelFile() {
    return tokenModelFile;
  }

  public void setTokenModelFile(String tokenModelFile) {
    this.tokenModelFile = tokenModelFile;
  }

  public String getPosModelFile() {
    return posModelFile;
  }

  public void setPosModelFile(String posModelFile) {
    this.posModelFile = posModelFile;
  }

  public String getTextField() {
    return textField;
  }

  public void setTextField(String textField) {
    this.textField = textField;
  }

  public String getPeopleField() {
    return peopleField;
  }

  public void setPeopleField(String peopleField) {
    this.peopleField = peopleField;
  }

  public String getPosTextField() {
    return posTextField;
  }

  public void setPosTextField(String posTextField) {
    this.posTextField = posTextField;
  }

  /**
   * Misspelled variant of {@link #setPosTextField(String)}, kept for existing
   * callers.
   *
   * @deprecated use {@link #setPosTextField(String)}
   */
  @Deprecated
  public void setPostTextField(String posTextField) {
    setPosTextField(posTextField);
  }
}