package org.wikibrain.lucene.tokenizers; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; import org.apache.lucene.analysis.pl.PolishAnalyzer; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.stempel.StempelFilter; import org.apache.lucene.analysis.stempel.StempelStemmer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.util.Version; import org.wikibrain.core.lang.Language; import org.wikibrain.lucene.TokenizerOptions; import java.io.IOException; /** * @author Ari Weiland */ public class PolishTokenizer extends LanguageTokenizer { private static StempelStemmer stemmer; protected PolishTokenizer(Version version, TokenizerOptions options, Language language) { super(version, options, language); } @Override public TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet) { if (stemmer == null) { try{ stemmer = new StempelStemmer(StempelStemmer.load(PolishAnalyzer.class.getResourceAsStream("stemmer_20000.tbl"))); } catch (IOException e) { throw new RuntimeException(e); } } TokenStream stream = new StandardFilter(matchVersion, tokenizer); if (caseInsensitive) stream = new LowerCaseFilter(matchVersion, stream); if (useStopWords) stream = new StopFilter(matchVersion, stream, PolishAnalyzer.getDefaultStopSet()); if (useStem) { if (!stemExclusionSet.isEmpty()) stream = new SetKeywordMarkerFilter(stream, stemExclusionSet); stream = new StempelFilter(stream, stemmer); } return stream; } }