package org.wikibrain.lucene.tokenizers;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseBaseFormFilter;
import org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter;
import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
import org.wikibrain.core.lang.Language;
import org.wikibrain.lucene.TokenizerOptions;
import java.io.Reader;
/**
 * Language tokenizer for Japanese, assembled from the components in Lucene's
 * {@code org.apache.lucene.analysis.ja} package plus a few general-purpose filters.
 *
 * <p>The flags {@code caseInsensitive}, {@code useStopWords} and {@code useStem}, and the
 * {@code matchVersion} value, are not declared in this class; presumably they are
 * inherited from {@link LanguageTokenizer} and derived from the constructor arguments.
 *
 * @author Ari Weiland
 */
public class JapaneseTokenizer extends LanguageTokenizer {

    /**
     * Forwards all arguments unchanged to the {@link LanguageTokenizer} constructor.
     *
     * @param version  Lucene version passed to the superclass
     * @param options  tokenizer options passed to the superclass
     * @param language language passed to the superclass
     */
    protected JapaneseTokenizer(Version version, TokenizerOptions options, Language language) {
        super(version, options, language);
    }

    /**
     * Creates the Lucene Japanese tokenizer that reads from the given reader.
     *
     * @param r reader handed to the Lucene tokenizer as its input
     * @return a new {@code org.apache.lucene.analysis.ja.JapaneseTokenizer}
     */
    @Override
    public Tokenizer makeTokenizer(Reader r) {
        // The Lucene class shares this class's simple name, so it must be fully qualified.
        // NOTE(review): per the Lucene 4.x constructor, the second and third arguments look
        // like (userDictionary, discardPunctuation) -- confirm against the Lucene version in use.
        return new org.apache.lucene.analysis.ja.JapaneseTokenizer(
                r,
                null,
                false,
                org.apache.lucene.analysis.ja.JapaneseTokenizer.DEFAULT_MODE);
    }

    /**
     * Wraps the tokenizer in the Japanese filter chain.
     *
     * <p>The base-form filter and the CJK width filter are always applied, in that order.
     * After them, each enabled flag appends its filter(s): lower-casing, then the
     * stop filters, then Katakana stemming.
     *
     * @param tokenizer        token source at the bottom of the chain
     * @param stemExclusionSet not consulted by this implementation
     * @return the outermost stream of the assembled chain
     */
    @Override
    public TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet) {
        // Unconditional part of the chain: base form first, width normalisation on top.
        TokenStream chain = new CJKWidthFilter(new JapaneseBaseFormFilter(tokenizer));

        if (caseInsensitive) {
            chain = new LowerCaseFilter(matchVersion, chain);
        }
        if (useStopWords) {
            chain = applyJapaneseStopFilters(chain);
        }
        return useStem ? new JapaneseKatakanaStemFilter(chain) : chain;
    }

    /**
     * Stacks the part-of-speech stop filter and then the stop-word filter on the input,
     * both configured with the defaults exposed by {@link JapaneseAnalyzer}.
     */
    private TokenStream applyJapaneseStopFilters(TokenStream input) {
        // NOTE(review): the leading 'true' is presumably enablePositionIncrements -- confirm.
        TokenStream withoutStopTags =
                new JapanesePartOfSpeechStopFilter(true, input, JapaneseAnalyzer.getDefaultStopTags());
        return new StopFilter(matchVersion, withoutStopTags, JapaneseAnalyzer.getDefaultStopSet());
    }
}