package org.voyanttools.trombone.lucene.analysis;
import java.io.IOException;
import java.io.StringReader;
import java.util.Collection;
import java.util.Iterator;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.tika.io.IOUtils;
import org.voyanttools.trombone.nlp.OpenNlpAnnotator;
import org.voyanttools.trombone.nlp.PosLemmas;
import opennlp.tools.util.Span;
/**
 * A Lucene {@link Tokenizer} that emits one token per lemma produced by an
 * {@link OpenNlpAnnotator}. The full input is read and annotated eagerly in
 * {@link #reset()}; {@link #incrementToken()} then walks the resulting
 * {@link PosLemmas} iterator, skipping tokens for which no lemma is available.
 */
final public class OpenNlpLemmaTokenizer extends Tokenizer {

	/** Annotator used to POS-tag and lemmatize the input text. */
	private OpenNlpAnnotator annotator;

	/** Iterator over the lemmatized tokens built in {@link #reset()}. */
	private Iterator<PosLemmas> tokensIterator;

	private PositionIncrementAttribute posIncr;
	private CharTermAttribute termAtt;
	private OffsetAttribute offsetAttribute;

	/** Corrected offset of the end of the input, reported in {@link #end()}. */
	private int finalOffset = 0;

	PosLemmas lemmas = null;

	/**
	 * Creates a tokenizer that emits lemmas determined by the given annotator.
	 *
	 * @param annotator the OpenNLP annotator used to lemmatize the input
	 */
	public OpenNlpLemmaTokenizer(OpenNlpAnnotator annotator) {
		super();
		this.annotator = annotator;
		posIncr = addAttribute(PositionIncrementAttribute.class);
		termAtt = addAttribute(CharTermAttribute.class);
		offsetAttribute = addAttribute(OffsetAttribute.class);
	}

	@Override
	public boolean incrementToken() throws IOException {
		// FIX: the previous implementation called next() before checking
		// hasNext() (NoSuchElementException on empty input) and returned
		// hasNext() AFTER populating the current token, which caused the last
		// token of every stream to be reported as absent. It also emitted
		// empty terms for tokens without a lemma; those are now skipped.
		while (tokensIterator != null && tokensIterator.hasNext()) {
			PosLemmas token = tokensIterator.next();
			String lemma = token.getCurrentLemma();
			if (lemma == null) {
				continue; // no lemma for this token; advance to the next one
			}
			clearAttributes();
			termAtt.setEmpty().append(lemma);
			int correctedStart = correctOffset(token.getCurrentStart());
			int correctedEnd = correctOffset(token.getCurrentEnd());
			token.setCurrentOffset(correctedStart, correctedEnd);
			offsetAttribute.setOffset(correctedStart, correctedEnd);
			posIncr.setPositionIncrement(1);
			return true; // a token has been produced
		}
		return false; // stream exhausted
	}

	@Override
	public void end() throws IOException {
		// Lucene TokenStream contract: report the final offset once exhausted
		// so consumers (e.g. highlighting) know the true end of the input.
		super.end();
		offsetAttribute.setOffset(finalOffset, finalOffset);
	}

	@Override
	public void reset() throws IOException {
		super.reset();
		// NOTE(review): reads the entire input into memory before annotating;
		// presumably acceptable for the document sizes involved — confirm.
		String text = IOUtils.toString(input);
		finalOffset = correctOffset(text.length());
		lemmas = annotator.getPosLemmas(text, annotator.getLang());
		tokensIterator = lemmas.iterator();
	}

	/**
	 * @return the lemmas computed during the last {@link #reset()}, or null
	 *         if reset has not been called yet
	 */
	public PosLemmas getPosLemmas() {
		return lemmas;
	}
}