package org.wikibrain.lucene.tokenizers;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
import org.wikibrain.core.lang.Language;
import org.wikibrain.lucene.TokenizerOptions;
import org.wikibrain.lucene.LuceneOptions;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
*
* This class is based on a class of the same name from Brent Hecht, WikiBrain.
* I have updated everything to properly function consistently with Lucene 4.3.
*
* This class is used to generate Tokenizers for specific languages. It allows for
* specifying different types of filters to apply to the child Tokenizers.
*
* There are currently 35 language-specific tokenizer subclasses, plus a
* DefaultTokenizer that will do its best on all other languages.
* Note that Simple English is treated as standard English.
*
* @author Ari Weiland
*
*/
public abstract class LanguageTokenizer {
private static final String STOP_WORDS = "src/main/resources/stopwords/";
private static Map<Language, Class> tokenizerClasses;
protected final Version matchVersion;
protected final boolean caseInsensitive;
protected final boolean useStopWords;
protected final boolean useStem;
protected final Language language;
protected LanguageTokenizer(Version version, TokenizerOptions tokenizerOptions, Language language) {
this.matchVersion = version;
this.caseInsensitive = tokenizerOptions.isCaseInsensitive();
this.useStopWords = tokenizerOptions.doesUseStopWords();
this.useStem = tokenizerOptions.doesUseStem();
this.language = language;
}
/**
* Primary workhorse method of this class. Children will implement this and apply
* appropriate filters to return a TokenStream.
*
* @param tokenizer
* @param stemExclusionSet
* @return
*/
public abstract TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet);
public Tokenizer makeTokenizer(Reader r) {
return new StandardTokenizer(matchVersion, r);
}
public TokenStream getTokenStream(Reader r) {
return getTokenStream(makeTokenizer(r), CharArraySet.EMPTY_SET);
}
public TokenizerOptions getTokenizerOptions() {
TokenizerOptions options = new TokenizerOptions();
if (caseInsensitive) options.caseInsensitive();
if (useStopWords) options.useStopWords();
if (useStem) options.useStem();
return options;
}
public Language getLanguage() {
return language;
}
/**
* Returns an instance of a LanguageTokenizer for the specified language
* with the filters specified by opts.
*
* @param language the language of the tokenizer to be retrieved
* @param opts the LuceneOptions object
* @return a LanguageTokenizer for language configured by opts
*/
public static LanguageTokenizer getLanguageTokenizer(Language language, LuceneOptions opts) {
try {
if (language.equals(Language.getByLangCode("simple"))) language = Language.getByLangCode("en"); // simple english
if (tokenizerClasses.containsKey(language)) { // is just english
return (LanguageTokenizer) tokenizerClasses.get(language)
.getDeclaredConstructor(
Version.class,
TokenizerOptions.class,
Language.class)
.newInstance(
opts.matchVersion,
opts.options,
language);
} else {
return new DefaultTokenizer(
opts.matchVersion,
opts.options,
language);
}
} catch (Exception e) {
throw new RuntimeException(e); // These exceptions are based on hard code and should never get thrown
}
}
/**
* Returns an instance of a LanguageTokenizer for the specified language
* with the filters specified by opts.
*
* @param language the language of the tokenizer to be retrieved
* @param opts the LuceneOptions object
* @return a LanguageTokenizer for language configured by opts
*/
public static LanguageTokenizer getLanguageTokenizer(Language language, TokenizerOptions opts, Version version) {
try {
if (language.equals(Language.getByLangCode("simple"))) language = Language.getByLangCode("en"); // simple english
if (tokenizerClasses.containsKey(language)) { // is just english
return (LanguageTokenizer) tokenizerClasses.get(language)
.getDeclaredConstructor(
Version.class,
TokenizerOptions.class,
Language.class)
.newInstance(
version,
opts,
language);
} else {
return new DefaultTokenizer(
version,
opts,
language);
}
} catch (Exception e) {
throw new RuntimeException(e); // These exceptions are based on hard code and should never get thrown
}
}
static {
tokenizerClasses = new HashMap<Language, Class>();
// These 26 tokenizers are functionally identical to Brent's code,
// except for Dutch (nl), which I modified a good deal
tokenizerClasses.put(Language.getByLangCode("en"), EnglishTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("de"), GermanTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("fr"), FrenchTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("nl"), DutchTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("it"), ItalianTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("pl"), PolishTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("es"), SpanishTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("ru"), RussianTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("ja"), JapaneseTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("pt"), PortugueseTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("zh"), ChineseTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("sv"), SwedishTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("uk"), UkrainianTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("ca"), CatalanTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("no"), NorwegianTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("fi"), FinnishTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("cs"), CzechTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("hu"), HungarianTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("ko"), KoreanTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("id"), IndonesianTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("tr"), TurkishTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("ro"), RomanianTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("sk"), SlovakTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("da"), DanishTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("he"), HebrewTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("lad"), LadinoTokenizer.class);
// I have added these 9 tokenizers myself
tokenizerClasses.put(Language.getByLangCode("ar"), ArabicTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("bg"), BulgarianTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("el"), GreekTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("eu"), BasqueTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("ga"), IrishTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("gl"), GalicianTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("hi"), HindiTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("hy"), ArmenianTokenizer.class);
tokenizerClasses.put(Language.getByLangCode("lv"), LatvianTokenizer.class);
// The following two tokenizers are of questionable functionality
// and are not currently implemented
// tokenizerClasses.put(Language.getByLangCode("fa"), PersianTokenizer.class);
// tokenizerClasses.put(Language.getByLangCode("th"), ThaiTokenizer.class);
}
protected static CharArraySet getStopWordsForNonLuceneLangFromFile(Version version, Language language) {
try{
String langCode = language.getLangCode();
String fileName = STOP_WORDS + langCode + ".txt";
CharArraySet charArraySet = new CharArraySet(version, 0, false);
File stopWordsFile = new File(fileName);
if (stopWordsFile.exists()) {
InputStream stream = FileUtils.openInputStream(new File(fileName));
List<String> stopWords = org.apache.commons.io.IOUtils.readLines(stream);
for (String stopWord : stopWords) {
charArraySet.add(stopWord);
}
}
return charArraySet;
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}