package org.wikibrain.lucene.tokenizers;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
import org.wikibrain.core.lang.Language;
import org.wikibrain.lucene.TokenizerOptions;
import java.io.Reader;

/**
 * A LanguageTokenizer for Chinese built on Lucene's smartcn module:
 * {@link SentenceTokenizer} splits the input into sentences, and
 * {@link WordTokenFilter} segments each sentence into words. Stop-word
 * removal and Porter stemming (which only affects embedded Latin-script
 * tokens) are applied according to the configured {@link TokenizerOptions}.
 *
 * @author Ari Weiland
 */
public class ChineseTokenizer extends LanguageTokenizer {

    protected ChineseTokenizer(Version version, TokenizerOptions options, Language language) {
        super(version, options, language);
    }

    @Override
    public Tokenizer makeTokenizer(Reader r) {
        // SentenceTokenizer is the first stage of the smartcn chain: it splits
        // raw text into sentences for the word segmenter to process.
        return new SentenceTokenizer(r);
    }

    @Override
    public TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet) {
        // WordTokenFilter segments each sentence into words using smartcn's segmenter.
        TokenStream stream = new WordTokenFilter(tokenizer);
        // No LowerCaseFilter is needed here: smartcn's SegTokenFilter already
        // lowercases Basic Latin text during segmentation.
        if (useStopWords) {
            stream = new StopFilter(matchVersion, stream, SmartChineseAnalyzer.getDefaultStopSet());
        }
        if (useStem) {
            // Porter stemming is deliberately aggressive; it only affects
            // embedded Latin-script tokens, not Chinese characters.
            stream = new PorterStemFilter(stream);
        }
        return stream;
    }
}
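
/*
 * A minimal usage sketch of the same analysis chain, wired up with Lucene's
 * smartcn classes directly (ChineseTokenizer's constructor is protected, so
 * the pipeline is reproduced by hand here). Version.LUCENE_47, the demo class
 * name, and the sample text are illustrative assumptions, not part of the
 * WikiBrain API; match the Version constant to the project's Lucene release.
 */
class ChineseTokenizerDemo {
    public static void main(String[] args) throws java.io.IOException {
        Reader reader = new java.io.StringReader("我爱自然语言处理。Lucene tokenizers rock!");
        // Same chain as above: sentences -> words -> stop-word removal -> stemming.
        Tokenizer tokenizer = new SentenceTokenizer(reader);
        TokenStream stream = new WordTokenFilter(tokenizer);
        stream = new StopFilter(Version.LUCENE_47, stream, SmartChineseAnalyzer.getDefaultStopSet());
        stream = new PorterStemFilter(stream);
        org.apache.lucene.analysis.tokenattributes.CharTermAttribute term =
                stream.addAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString()); // one segmented token per line
        }
        stream.end();
        stream.close();
    }
}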