package org.voyanttools.trombone.lucene.analysis;

import static org.junit.Assert.*;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.junit.Test;
import org.voyanttools.trombone.nlp.NlpAnnotator;
import org.voyanttools.trombone.nlp.NlpFactory;
import org.voyanttools.trombone.nlp.StanfordNlpAnnotator;

/**
 * Checks the terms emitted by {@link StanfordNlpLemmaTokenizer} for a short
 * piece of marked-up text read through an {@link HTMLCharFilter}.
 */
public class StanfordNlpLemmaTokenizerTest {

    /**
     * Tokenizes an English sentence containing inline markup and verifies the
     * emitted terms, in order, against the expected lemmas.
     *
     * @throws IOException if the tokenizer fails while reading or closing its input
     */
    @Test
    public void test() throws IOException {
        NlpFactory nlpFactory = new NlpFactory();

        assertLemmas(nlpFactory, "en", "These dogs <b>are</b> interesting.",
                "these", "dog", "be", "interesting");

        // The French case is disabled, as it was in the original (commented-out) code;
        // the input text and expected lemmas below are carried over unchanged.
        // NOTE(review): the reason it was disabled is not recorded - confirm before re-enabling.
        // assertLemmas(nlpFactory, "fr", "Ces chiens <b>sont</b> intéressants.",
        //         "ce", "chien", "sont", "intéressant");
    }

    /**
     * Runs {@code markup} through an {@link HTMLCharFilter} and a
     * {@link StanfordNlpLemmaTokenizer} for the given language and asserts that the
     * emitted terms equal {@code expectedLemmas}, in order and in number.
     *
     * <p>If the factory does not return a {@link StanfordNlpAnnotator} for the
     * language, nothing is checked and the method returns normally.
     * NOTE(review): this means the test passes silently in that situation -
     * consider whether it should be reported as skipped instead.
     *
     * @param nlpFactory     factory used to look up the annotator
     * @param language       language code passed to {@link NlpFactory#getNlpAnnotator}
     * @param markup         text (possibly containing markup) to tokenize
     * @param expectedLemmas terms the tokenizer is expected to emit, in order
     * @throws IOException if the tokenizer fails while reading or closing its input
     */
    private static void assertLemmas(NlpFactory nlpFactory, String language, String markup,
            String... expectedLemmas) throws IOException {
        NlpAnnotator annotator = nlpFactory.getNlpAnnotator(language);
        if (!(annotator instanceof StanfordNlpAnnotator)) {
            return;
        }

        // try-with-resources guarantees close() even when an assertion fails mid-stream.
        try (Tokenizer tokenizer = new StanfordNlpLemmaTokenizer((StanfordNlpAnnotator) annotator)) {
            Reader reader = new HTMLCharFilter(new StringReader(markup));
            tokenizer.setReader(reader);
            tokenizer.reset();

            // Attribute instances are looked up once and re-read after every incrementToken().
            CharTermAttribute termAtt = tokenizer.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);

            int count = 0;
            while (tokenizer.incrementToken()) {
                // Fail with a readable message instead of an ArrayIndexOutOfBoundsException
                // when the tokenizer emits more terms than expected.
                assertTrue("unexpected extra token '" + termAtt + "' at position " + count,
                        count < expectedLemmas.length);
                // The start offset is included in the message to help locate a mismatch.
                assertEquals("lemma " + count + " (start offset " + offsetAtt.startOffset() + ")",
                        expectedLemmas[count], termAtt.toString());
                count++;
            }
            tokenizer.end();

            // FIXME (carried over from the original): why is the last lemma being dropped
            // from the iterator?
            // NOTE(review): this assertion expects every lemma to be emitted, so the FIXME
            // may be stale - confirm.
            assertEquals("number of lemmas", expectedLemmas.length, count);
        }
    }
}