package org.wikibrain.lucene.tokenizers;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.DutchStemmer;
import org.wikibrain.core.lang.Language;
import org.wikibrain.lucene.TokenizerOptions;
/**
 * {@link LanguageTokenizer} for Dutch. Builds the Lucene filter chain applied
 * on top of a raw {@link Tokenizer}: standard filtering, optional lower-casing,
 * optional stop-word removal using the default stop set of
 * {@link DutchAnalyzer}, and optional Snowball stemming with
 * {@link DutchStemmer}.
 *
 * @author Ari Weiland
 */
public class DutchTokenizer extends LanguageTokenizer {

    /**
     * Creates a Dutch tokenizer. All arguments are forwarded unchanged to the
     * {@link LanguageTokenizer} constructor.
     *
     * @param version  the Lucene version passed to the superclass
     * @param options  the tokenizer options passed to the superclass
     * @param language the language passed to the superclass
     */
    protected DutchTokenizer(Version version, TokenizerOptions options, Language language) {
        super(version, options, language);
    }

    // Disabled stem-override dictionary. The entries below were meant to pin
    // the stems of a few words (per the original notes, "fiets" would
    // otherwise be stemmed to "fiet" and "bromfiets" to "bromfiet"). It is NOT
    // applied as long as the StemmerOverrideFilter step in getTokenStream
    // stays commented out.
    //
    // static final CharArrayMap<String> DEFAULT_STEM_DICT;
    // static {
    //     DEFAULT_STEM_DICT = new CharArrayMap<String>(matchVersion, 4, false);
    //     DEFAULT_STEM_DICT.put("fiets", "fiets"); //otherwise fiet
    //     DEFAULT_STEM_DICT.put("bromfiets", "bromfiets"); //otherwise bromfiet
    //     DEFAULT_STEM_DICT.put("ei", "eier");
    //     DEFAULT_STEM_DICT.put("kind", "kinder");
    // }

    /**
     * Wraps {@code tokenizer} in the Dutch filter chain selected by the
     * inherited {@code caseInsensitive}, {@code useStopWords} and
     * {@code useStem} settings.
     *
     * @param tokenizer        the token source to wrap
     * @param stemExclusionSet terms to mark as keywords ahead of the stemmer;
     *                         only consulted when stemming is enabled. A
     *                         {@code null} or empty set means no terms are
     *                         marked.
     * @return the fully assembled token stream
     */
    @Override
    public TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet) {
        TokenStream stream = new StandardFilter(matchVersion, tokenizer);
        if (caseInsensitive) {
            stream = new LowerCaseFilter(matchVersion, stream);
        }
        if (useStopWords) {
            stream = new StopFilter(matchVersion, stream, DutchAnalyzer.getDefaultStopSet());
        }
        if (useStem) {
            // Treat null like an empty set so callers without exclusions do
            // not trigger a NullPointerException here.
            if (stemExclusionSet != null && !stemExclusionSet.isEmpty()) {
                stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
            }
            // TODO: the stem-override step is disabled because DEFAULT_STEM_DICT
            // is commented out above; restore both together.
            // stream = new StemmerOverrideFilter(stream, DEFAULT_STEM_DICT);
            stream = new SnowballFilter(stream, new DutchStemmer());
        }
        return stream;
    }
}