/** * */ package org.voyanttools.trombone.lucene.analysis; import java.io.IOException; import java.io.Reader; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.voyanttools.trombone.model.TokenType; import org.voyanttools.trombone.nlp.NlpFactory; import org.voyanttools.trombone.nlp.OpenNlpAnnotator; import org.voyanttools.trombone.nlp.PosLemmas; /** * @author sgs * */ public class LemmaAnalyzer extends LexicalAnalyzer { private NlpFactory factory; private OpenNlpLemmaTokenizer openNlpTokenizer; public LemmaAnalyzer(NlpFactory factory) { this.factory = factory; } @Override protected Reader initReader(String fieldName, Reader reader) { if (fieldName.equals(TokenType.lemma.name())) { reader = initReader(reader); } else { parameters.clear(); } try { return new HTMLCharFilter(reader); } catch (IOException e) { throw new RuntimeException(e); } } @Override protected TokenStreamComponents createComponents(String fieldName) { if (fieldName.equals(TokenType.lemma.name()) && lang!=null && lang.isEmpty()==false) { /* NlpAnnotator annotator = factory.getNlpAnnotator(lang); if (annotator instanceof StanfordNlpAnnotator) { Tokenizer tokenizer = new StanfordNlpLemmaTokenizer((StanfordNlpAnnotator) annotator); TokenStream stream = new LowerCaseFilter(tokenizer); return new TokenStreamComponents(tokenizer, stream); } */ if (lang.equals("en") || lang.equals("fr") || lang.equals("de") || lang.equals("it") || lang.equals("nl") || lang.equals("es")) { OpenNlpAnnotator annotator; try { annotator = factory.getOpenNlpAnnotator(lang); } catch (IOException e) { throw new RuntimeException("Unable to load lemmatizer for language: "+lang); } openNlpTokenizer = new OpenNlpLemmaTokenizer(annotator); TokenStream stream = new LowerCaseFilter(openNlpTokenizer); return new TokenStreamComponents(openNlpTokenizer, stream); } else { throw new RuntimeException("Unable to create Lemmatizer for "+lang); } } // not sure this is a good idea, but let's use lexical forms for now return super.createComponents(TokenType.lexical.name()); } public PosLemmas getPostStreamPosLemmas() { return openNlpTokenizer.getPosLemmas(); } }