package org.wikibrain.lucene; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.wikibrain.core.lang.Language; import org.wikibrain.lucene.tokenizers.LanguageTokenizer; import java.io.Reader; /** * * This class is based on a class of the same name from Brent Hecht, WikiBrain. * I have updated everything to properly function consistently with lucene 4.3. * * This class functions as a Lucene Analyzer for a specific language. It runs * off of the functions built into the LanguageTokenizer class. * * TODO: add language overrides for unsupported languages? * In other words, analyze language X as similar language Y * ie. Ukrainian -> Russian and Ladino -> Spanish * * @author Ari Weiland * */ public class WikiBrainAnalyzer extends Analyzer { private final Language language; private final LanguageTokenizer languageTokenizer; private final LuceneOptions options; /** * Constructs a WikiBrainAnalyzer for the specified language with all filters * and default options. * * @param language the language this analyzer analyzes */ public WikiBrainAnalyzer(Language language) { this(language, LuceneOptions.getDefaultOptions()); } /** * Constructs a WikiBrainAnalyzer for the specified language with specified filters * and specified options. * * @param language the language this analyzer analyzes * @param options a LuceneOptions object containing specific options for lucene */ public WikiBrainAnalyzer(Language language, LuceneOptions options) { this.language = language; this.languageTokenizer = LanguageTokenizer.getLanguageTokenizer(language, options); this.options = options; } public Language getLanguage() { return language; } public LuceneOptions getOptions() { return options; } @Override protected Analyzer.TokenStreamComponents createComponents(String s, Reader r) { Tokenizer tokenizer = languageTokenizer.makeTokenizer(r); TokenStream result = languageTokenizer.getTokenStream(tokenizer, CharArraySet.EMPTY_SET); return new TokenStreamComponents(tokenizer, result); } }