package org.voyanttools.trombone.lucene.analysis;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.tika.io.IOUtils;
import org.voyanttools.trombone.nlp.StanfordNlpAnnotator;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
/**
 * Lucene {@link Tokenizer} that emits the lemma of every token found by a
 * {@link StanfordNlpAnnotator} in the input text.
 *
 * <p>The complete input is read and annotated in {@link #reset()}; the
 * resulting tokens are then handed out one at a time by
 * {@link #incrementToken()}. Tokens for which no lemma is available are not
 * emitted, but the position they occupied is preserved through the position
 * increment of the next emitted token.</p>
 */
public final class StanfordNlpLemmaTokenizer extends Tokenizer {

    /** Produces the annotated document from the raw input text. */
    private final StanfordNlpAnnotator annotator;

    /** Tokens of the current document; {@code null} until {@link #reset()} has run. */
    private Iterator<CoreLabel> tokensIterator;

    private final PositionIncrementAttribute posIncr;
    private final CharTermAttribute termAtt;
    private final OffsetAttribute offsetAttribute;

    /** Number of lemma-less tokens skipped since the last emitted token. */
    private int skippedPositions;

    /** Corrected offset of the end of the input, reported by {@link #end()}. */
    private int finalOffset;

    /**
     * Creates a tokenizer that uses the given annotator.
     *
     * @param annotator the annotator used to tokenize and lemmatize the input
     */
    public StanfordNlpLemmaTokenizer(StanfordNlpAnnotator annotator) {
        super();
        this.annotator = annotator;
        posIncr = addAttribute(PositionIncrementAttribute.class);
        termAtt = addAttribute(CharTermAttribute.class);
        offsetAttribute = addAttribute(OffsetAttribute.class);
    }

    /**
     * Advances to the next token that has a lemma and exposes it through the
     * term, offset and position-increment attributes.
     *
     * @return {@code true} if a token was populated, {@code false} once all
     *         tokens have been consumed
     * @throws IllegalStateException if called before {@link #reset()}
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (tokensIterator == null) {
            throw new IllegalStateException("incrementToken() called before reset()");
        }
        clearAttributes();
        // The return value must say whether THIS call produced a token, so the
        // iterator is checked before consuming rather than after.
        while (tokensIterator.hasNext()) {
            CoreLabel token = tokensIterator.next();
            String lemma = token.lemma();
            if (lemma == null) {
                // Nothing to emit for this token; remember the gap so that the
                // positions of following tokens are not shifted.
                skippedPositions++;
                continue;
            }
            termAtt.setEmpty().append(lemma);
            // correctOffset maps offsets back to the original text when the
            // input is wrapped in a CharFilter.
            offsetAttribute.setOffset(
                    correctOffset(token.beginPosition()),
                    correctOffset(token.endPosition()));
            posIncr.setPositionIncrement(skippedPositions + 1);
            skippedPositions = 0;
            return true;
        }
        return false;
    }

    /**
     * Reports the end-of-stream state: the final offset of the input and any
     * positions skipped after the last emitted token.
     */
    @Override
    public void end() throws IOException {
        super.end();
        offsetAttribute.setOffset(finalOffset, finalOffset);
        posIncr.setPositionIncrement(posIncr.getPositionIncrement() + skippedPositions);
    }

    /**
     * Reads the whole input, annotates it and collects the tokens of all
     * sentences, in order, for subsequent {@link #incrementToken()} calls.
     *
     * @throws IOException if the input cannot be read
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        String text = IOUtils.toString(input);
        skippedPositions = 0;
        finalOffset = correctOffset(text.length());

        Annotation document = annotator.getAnnotated(text);
        Collection<CoreMap> sentences = document.get(SentencesAnnotation.class);
        Collection<CoreLabel> tokens = new ArrayList<CoreLabel>();
        // The annotation may carry no sentences (or no tokens for a sentence);
        // treat that as an empty token stream instead of failing.
        if (sentences != null) {
            for (CoreMap sentence : sentences) {
                Collection<CoreLabel> sentenceTokens = sentence.get(TokensAnnotation.class);
                if (sentenceTokens != null) {
                    tokens.addAll(sentenceTokens);
                }
            }
        }
        tokensIterator = tokens.iterator();
    }
}