/**
 * 
 */
package org.voyanttools.trombone.nlp;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.voyanttools.trombone.lucene.CorpusMapper;
import org.voyanttools.trombone.model.DocumentEntity;
import org.voyanttools.trombone.model.EntityType;
import org.voyanttools.trombone.model.IndexedDocument;
import org.voyanttools.trombone.model.TokenType;
import org.voyanttools.trombone.util.FlexibleParameters;

import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.MentionsAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.NormalizedNamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

/**
 * @author sgs
 *
 */
public class StanfordNlpAnnotator implements NlpAnnotator {
	
	StanfordCoreNLP pipeline;

	/**
	 * Builds a CoreNLP pipeline with tokenization, sentence splitting, POS tagging,
	 * lemmatization, named entity recognition and entity mention detection.
	 */
	StanfordNlpAnnotator(String languageCode) {
		Properties props = new Properties();
		props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, entitymentions");
		if (languageCode.equals("fr")) {
			// intended to point the pipeline at the French models configuration
			props.setProperty("props", "StanfordCoreNLP-french.properties");
		}
		pipeline = new StanfordCoreNLP(props);
	}

	@Override
	public List<DocumentEntity> getEntities(CorpusMapper corpusMapper, IndexedDocument indexedDocument, Collection<EntityType> types, FlexibleParameters parameters) throws IOException {
		
		List<CoreMap> entitiesMap = getEntities(indexedDocument.getDocumentString(), types);
		
		// organize by term and entity type so that repeated mentions can be grouped together
		Map<String, List<CoreMap>> stringEntitiesMap = new HashMap<String, List<CoreMap>>();
		for (CoreMap entity : entitiesMap) {
			String term = entity.get(TextAnnotation.class);
			EntityType type = EntityType.getForgivingly(entity.get(NamedEntityTagAnnotation.class));
			String key = term + " --" + type.name();
			if (!stringEntitiesMap.containsKey(key)) {
				stringEntitiesMap.put(key, new ArrayList<CoreMap>());
			}
			stringEntitiesMap.get(key).add(entity);
		}
		
		// only compute token positions when a distribution has been requested
		Map<Integer, Integer> offsetToTokenPositionMap = parameters.getParameterBooleanValue("withDistribution") ?
				getOffsetsToPositionsMap(corpusMapper, indexedDocument, entitiesMap) : null;
		
		List<DocumentEntity> entities = new ArrayList<DocumentEntity>();
		int corpusDocumentIndex = corpusMapper.getCorpus().getDocumentPosition(indexedDocument.getId());
		for (Map.Entry<String, List<CoreMap>> stringEntitiesMapEntry : stringEntitiesMap.entrySet()) {
			List<CoreMap> coreMaps = stringEntitiesMapEntry.getValue();
			
			// map each mention's starting character offset back to a token position, when available
			List<Integer> positions = new ArrayList<Integer>();
			if (offsetToTokenPositionMap != null) {
				for (CoreMap entity : coreMaps) {
					int startOffset = entity.get(CharacterOffsetBeginAnnotation.class);
					Integer position = offsetToTokenPositionMap.get(startOffset);
					if (position != null) {positions.add(position);}
				}
			}
			
			// use the first mention as representative of the whole group
			CoreMap entity = coreMaps.get(0);
			String term = entity.get(TextAnnotation.class);
			String normalized = entity.get(NormalizedNamedEntityTagAnnotation.class);
			EntityType type = EntityType.getForgivingly(entity.get(NamedEntityTagAnnotation.class));
			DocumentEntity e = new DocumentEntity(corpusDocumentIndex, term, normalized, type,
					positions.isEmpty() ? coreMaps.size() : positions.size(),
					positions.isEmpty() ? null : ArrayUtils.toPrimitive(positions.toArray(new Integer[0])));
			entities.add(e);
		}
		
		return entities;
	}
	
	private Map<Integer, Integer> getOffsetsToPositionsMap(CorpusMapper corpusMapper, IndexedDocument indexedDocument, List<CoreMap> entitiesMap) throws IOException {
		
		// go through and collect the offsets to keep
		Set<Integer> offsets = new HashSet<Integer>();
		for (CoreMap entity : entitiesMap) {
			offsets.add(entity.get(CharacterOffsetBeginAnnotation.class));
		}
		
		// go through the term vector to collect the tokens of interest
		Map<Integer, Integer> offsetToTokenPositionMap = new HashMap<Integer, Integer>();
		int luceneDoc = corpusMapper.getLuceneIdFromDocumentId(indexedDocument.getId());
		// TODO: check that we can assume that offsets align regardless of TokenType
		Terms terms = corpusMapper.getLeafReader().getTermVector(luceneDoc, TokenType.lexical.name());
		TermsEnum termsEnum = terms.iterator();
		while (termsEnum.next() != null) {
			PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
			if (postingsEnum != null) {
				postingsEnum.nextDoc();
				for (int i = 0, len = postingsEnum.freq(); i < len; i++) {
					int pos = postingsEnum.nextPosition();
					int offset = postingsEnum.startOffset();
					if (offsets.contains(offset)) {
						offsetToTokenPositionMap.put(offset, pos);
					}
				}
			}
		}
		return offsetToTokenPositionMap;
	}
	
	private List<CoreMap> getEntities(String text, Collection<EntityType> types) {
		List<CoreMap> sentences = getSentences(text);
		List<CoreMap> entities = new ArrayList<CoreMap>();
		for (CoreMap sentence : sentences) {
			// keep only recognized mentions of the requested types (or all types if none were requested)
			for (CoreMap entity : sentence.get(MentionsAnnotation.class)) {
				EntityType type = EntityType.getForgivingly(entity.get(NamedEntityTagAnnotation.class));
				if (type != EntityType.unknnown && (types.isEmpty() || types.contains(type))) {
					entities.add(entity);
				}
			}
		}
		return entities;
	}
	
	public Annotation getAnnotated(String text) {
		Annotation document = new Annotation(text);
		pipeline.annotate(document);
		return document;
	}
	
	private List<CoreMap> getSentences(String text) {
		return getAnnotated(text).get(SentencesAnnotation.class);
	}

}
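/*
 * Minimal usage sketch, adapted from the commented-out main method that used to live in the
 * class above: it relies only on the package-visible constructor and the public
 * getAnnotated(String) method, and prints each recognized entity mention per sentence.
 * The example class name is a hypothetical addition, and the English models must be on the
 * classpath for the pipeline to load.
 */
class StanfordNlpAnnotatorExample {
	public static void main(String[] args) {
		StanfordNlpAnnotator annotator = new StanfordNlpAnnotator("en");
		Annotation document = annotator.getAnnotated("October 1, 2015. This is a test from yesterday in London, UK.");
		for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
			// entity mentions are attached to each sentence by the "entitymentions" annotator
			for (CoreMap mention : sentence.get(MentionsAnnotation.class)) {
				System.out.println(mention.get(TextAnnotation.class) + " -> " + mention.get(NamedEntityTagAnnotation.class));
			}
		}
	}
}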