package org.voyanttools.trombone.nlp;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.voyanttools.trombone.lucene.CorpusMapper;
import org.voyanttools.trombone.model.DocumentEntity;
import org.voyanttools.trombone.model.IndexedDocument;
import org.voyanttools.trombone.model.TokenType;
import org.voyanttools.trombone.model.EntityType;
import org.voyanttools.trombone.util.FlexibleParameters;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.MentionsAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.NormalizedNamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
/**
 * An {@link NlpAnnotator} backed by Stanford CoreNLP that extracts named
 * entities, and optionally their token positions, from indexed documents.
 *
 * @author sgs
 */
public class StanfordNlpAnnotator implements NlpAnnotator {
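	// the CoreNLP pipeline, configured once per language in the constructor and reused for every document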
StanfordCoreNLP pipeline;
	/**
	 * Creates an annotator whose pipeline tokenizes, splits sentences, tags
	 * parts of speech, lemmatizes, and recognizes named entity mentions.
	 *
	 * @param languageCode an ISO 639-1 language code; "fr" switches to the bundled French configuration, anything else uses the English defaults
	 */
	StanfordNlpAnnotator(String languageCode) {
		Properties props = new Properties();
		props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, entitymentions");
		if ("fr".equals(languageCode)) {
			// intended to pull in the bundled French configuration (mirrors the
			// -props command-line flag); assumes the French models are on the classpath
			props.setProperty("props", "StanfordCoreNLP-french.properties");
		}
		pipeline = new StanfordCoreNLP(props);
	}
@Override
public List<DocumentEntity> getEntities(CorpusMapper corpusMapper, IndexedDocument indexedDocument, Collection<EntityType> types, FlexibleParameters parameters) throws IOException {
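		// three steps: extract entity mentions from the document text, group
		// identical term/type pairs, then optionally resolve character offsets to token positions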
List<CoreMap> entitiesMap = getEntities(indexedDocument.getDocumentString(), types);
		// group mentions by term and entity type so that repeated mentions of the same entity collapse into one
		Map<String, List<CoreMap>> stringEntitiesMap = new HashMap<String, List<CoreMap>>();
		for (CoreMap entity : entitiesMap) {
			String term = entity.get(TextAnnotation.class);
			EntityType type = EntityType.getForgivingly(entity.get(NamedEntityTagAnnotation.class));
			String key = term + " -- " + type.name();
			if (!stringEntitiesMap.containsKey(key)) {
				stringEntitiesMap.put(key, new ArrayList<CoreMap>());
			}
			stringEntitiesMap.get(key).add(entity);
		}
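		// the offset-to-position map requires a term vector scan, so only build it when distributions are requested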
Map<Integer, Integer> offsetToTokenPositionMap = parameters.getParameterBooleanValue("withDistribution") ? getOffsetsToPositionsMap(corpusMapper, indexedDocument, entitiesMap) : null;
List<DocumentEntity> entities = new ArrayList<DocumentEntity>();
int corpusDocumentIndex = corpusMapper.getCorpus().getDocumentPosition(indexedDocument.getId());
for (Map.Entry<String, List<CoreMap>> stringEntitiesMapEntry : stringEntitiesMap.entrySet()) {
List<CoreMap> coreMaps = stringEntitiesMapEntry.getValue();
List<Integer> positions = new ArrayList<Integer>();
if (offsetToTokenPositionMap!=null) {
for (CoreMap entity : coreMaps) {
int startOffset = entity.get(CharacterOffsetBeginAnnotation.class);
Integer position = offsetToTokenPositionMap.get(startOffset);
if (position!=null) {positions.add(position);}
}
}
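			// every mention in this group shares the same term and type, so the first can represent the group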
CoreMap entity = coreMaps.get(0);
String term = entity.get(TextAnnotation.class);
String normalized = entity.get(NormalizedNamedEntityTagAnnotation.class);
EntityType type = EntityType.getForgivingly(entity.get(NamedEntityTagAnnotation.class));
			DocumentEntity e = new DocumentEntity(corpusDocumentIndex, term, normalized, type,
					positions.isEmpty() ? coreMaps.size() : positions.size(),
					positions.isEmpty() ? null : ArrayUtils.toPrimitive(positions.toArray(new Integer[0])));
entities.add(e);
}
return entities;
}
private Map<Integer, Integer> getOffsetsToPositionsMap(CorpusMapper corpusMapper, IndexedDocument indexedDocument, List<CoreMap> entitiesMap) throws IOException {
// go through and collect offsets to keep
Set<Integer> offsets = new HashSet<Integer>();
for (CoreMap entity : entitiesMap) {
offsets.add(entity.get(CharacterOffsetBeginAnnotation.class));
}
// go through vector to collect tokens of interest
Map<Integer, Integer> offsetToTokenPositionMap = new HashMap<Integer, Integer>();
int luceneDoc = corpusMapper.getLuceneIdFromDocumentId(indexedDocument.getId());
// TODO: check that we can assume that offsets align regardless of TokenType
Terms terms = corpusMapper.getLeafReader().getTermVector(luceneDoc, TokenType.lexical.name());
TermsEnum termsEnum = terms.iterator();
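		// walk each term's postings in this document's term vector, recording the
		// token position of any start offset that matches an entity mention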
		while (termsEnum.next()!=null) {
			PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
			if (postingsEnum!=null) {
				postingsEnum.nextDoc();
				for (int i=0, len = postingsEnum.freq(); i<len; i++) {
					int pos = postingsEnum.nextPosition();
					int offset = postingsEnum.startOffset();
					if (offsets.contains(offset)) {
						offsetToTokenPositionMap.put(offset, pos);
					}
				}
			}
		}
return offsetToTokenPositionMap;
}
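	/**
	 * Annotates the text and returns the entity mentions whose type is
	 * recognized and, when the types collection is non-empty, matches one of
	 * the requested types; an empty collection matches all recognized types.
	 */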
private List<CoreMap> getEntities(String text, Collection<EntityType> types) {
List<CoreMap> sentences = getSentences(text);
List<CoreMap> entities = new ArrayList<CoreMap>();
for(CoreMap sentence: sentences) {
for (CoreMap entity : sentence.get(MentionsAnnotation.class)) {
EntityType type = EntityType.getForgivingly(entity.get(NamedEntityTagAnnotation.class));
			if (type!=EntityType.unknown && (types.isEmpty() || types.contains(type))) {
entities.add(entity);
}
}
}
return entities;
}
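	/**
	 * Runs the full CoreNLP pipeline on the given text.
	 */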
public Annotation getAnnotated(String text) {
Annotation document = new Annotation(text);
pipeline.annotate(document);
return document;
}
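	// convenience accessor for the sentence-level annotations of freshly annotated text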
private List<CoreMap> getSentences(String text) {
return getAnnotated(text).get(SentencesAnnotation.class);
}
//	public static void main(String[] args) {
//		StanfordNlpAnnotator annotator = new StanfordNlpAnnotator("en");
//		// an empty types collection keeps all recognized entity types
//		for (CoreMap entity : annotator.getEntities("October 1, 2015. This is a test from yesterday in London, UK.", new ArrayList<EntityType>())) {
//			System.out.println(entity.get(TextAnnotation.class)+" ("+entity.get(NamedEntityTagAnnotation.class)+")");
//		}
//	}
}