package org.wikibrain.lucene.tokenizers;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.it.ItalianAnalyzer;
import org.apache.lucene.analysis.it.ItalianLightStemFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ElisionFilter;
import org.apache.lucene.util.Version;
import org.wikibrain.core.lang.Language;
import org.wikibrain.lucene.TokenizerOptions;
import java.util.Arrays;

/**
 * Builds the Lucene {@link TokenStream} filter chain for Italian text. The chain
 * mirrors Lucene's {@link ItalianAnalyzer}: standard filtering, optional
 * lower-casing, elision and stop-word removal, and optional light stemming.
 *
 * @author Ari Weiland
 */
public class ItalianTokenizer extends LanguageTokenizer {
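    // Elided Italian articles stripped by the ElisionFilter below (e.g. "l'acqua" -> "acqua");
    // the list matches the article set used internally by Lucene's ItalianAnalyzer.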
    private final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
            new CharArraySet(matchVersion, Arrays.asList(
                    "c", "l", "all", "dall", "dell", "nell", "sull", "coll", "pell",
                    "gl", "agl", "dagl", "degl", "negl", "sugl", "un", "m", "t", "s", "v", "d"
            ), true));

    protected ItalianTokenizer(Version version, TokenizerOptions options, Language language) {
        super(version, options, language);
    }

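    /**
     * Assembles the Italian analysis chain in the same order as Lucene's
     * {@link ItalianAnalyzer}: standard filtering, optional lower-casing, elision
     * and stop-word removal, then optional light stemming with keyword protection
     * for the exclusion set.
     *
     * <p>Minimal usage sketch (illustrative only; in WikiBrain the tokenizer is
     * normally obtained through the LanguageTokenizer factory code, and the
     * StandardTokenizer/StringReader wiring here is an assumption):
     * <pre>{@code
     * Tokenizer source = new StandardTokenizer(matchVersion, new StringReader("l'acqua delle fontane"));
     * TokenStream stream = getTokenStream(source, CharArraySet.EMPTY_SET);
     * }</pre>
     */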
    @Override
    public TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet) {
        // Standard token normalization over the raw tokenizer output
        TokenStream stream = new StandardFilter(matchVersion, tokenizer);
        if (caseInsensitive)
            stream = new LowerCaseFilter(matchVersion, stream);
        if (useStopWords) {
            // Strip elided articles ("l'acqua" -> "acqua"), then remove Italian stop words
            stream = new ElisionFilter(stream, DEFAULT_ARTICLES);
            stream = new StopFilter(matchVersion, stream, ItalianAnalyzer.getDefaultStopSet());
        }
        if (useStem) {
            // Mark exclusion-set terms as keywords so the stemmer leaves them untouched
            if (!stemExclusionSet.isEmpty())
                stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
            stream = new ItalianLightStemFilter(stream);
        }
        return stream;
    }
}