package org.wikibrain.lucene;
import com.typesafe.config.Config;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.wikibrain.conf.Configuration;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.lang.LocalString;
import org.wikibrain.core.lang.StringNormalizer;
import org.wikibrain.lucene.tokenizers.LanguageTokenizer;
import java.io.IOException;
import java.io.StringReader;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Normalizes a string into its canonical form using a language-specific Lucene tokenizer:
 * the text is tokenized and the resulting tokens are rejoined with single spaces.
*
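 * A minimal usage sketch (the Version constant, TokenizerOptions flags, and language code
 * below are illustrative; use whatever matches your configuration):
 * <pre>{@code
 * StringNormalizer normalizer = new LuceneStringNormalizer(
 *         new TokenizerOptions(true, true, true), Version.LUCENE_43);
 * String norm = normalizer.normalize(Language.getByLangCode("en"), "The Quick Brown Foxes");
 * }</pre>
 *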
* @author Shilad Sen
*/
public class LuceneStringNormalizer implements StringNormalizer {
private final Version version;
private final TokenizerOptions options;
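    // Lazily populated per-language tokenizer cache; a ConcurrentHashMap so that
    // normalize() can safely be called from multiple threads.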
private final Map<Language, LanguageTokenizer> tokenizers = new ConcurrentHashMap<Language, LanguageTokenizer>();
public LuceneStringNormalizer(TokenizerOptions options, Version version) {
this.options = options;
this.version = version;
}
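    /**
     * Returns the tokenizer for the given language, creating and caching it on first use.
     * The check-then-put below is not atomic, so two threads may occasionally build a
     * tokenizer for the same language; one instance simply wins the race, which should be
     * harmless as long as LanguageTokenizer construction has no side effects.
     */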
public LanguageTokenizer getTokenizer(Language language) {
if (!tokenizers.containsKey(language)) {
tokenizers.put(language, LanguageTokenizer.getLanguageTokenizer(language, options, version));
}
return tokenizers.get(language);
}
@Override
public String normalize(LocalString string) {
return normalize(string.getLanguage(), string.getString());
}
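    /**
     * Tokenizes the text with the language-specific tokenizer and rejoins the surviving
     * tokens with single spaces. With case folding, stop-word removal, and stemming all
     * enabled, English text such as "The Quick Brown Foxes" would typically come back as
     * something like "quick brown fox"; the exact output depends on the TokenizerOptions
     * and the language's analyzer.
     */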
@Override
    public String normalize(Language language, String text) {
        StringBuilder normalized = new StringBuilder();
        TokenStream stream = null;
        try {
            stream = getTokenizer(language).getTokenStream(new StringReader(text));
            CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                             // required before the first incrementToken()
            while (stream.incrementToken()) {
                if (normalized.length() > 0) {
                    normalized.append(' ');             // join tokens with single spaces
                }
                normalized.append(cattr.toString());
            }
            stream.end();
            return normalized.toString();
        } catch (IOException e) {
            throw new RuntimeException(e);
        } finally {
            if (stream != null) {
                try {
                    stream.close();                     // release the stream even if tokenization failed
                } catch (IOException e) {
                    // nothing sensible to do; the result or original exception still propagates
                }
            }
        }
    }
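    /**
     * Configurator provider that builds LuceneStringNormalizers from entries under the
     * "stringnormalizers" configuration path, handling only entries whose type is "lucene".
     * A hypothetical HOCON entry using the keys read below (the project's reference
     * configuration remains the authoritative source for the real settings):
     *
     * <pre>
     * stringnormalizers : {
     *     lucene : {
     *         type : lucene
     *         version : "4.3"
     *         caseInsensitive : true
     *         useStopWords : true
     *         useStem : true
     *     }
     * }
     * </pre>
     */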
public static class Provider extends org.wikibrain.conf.Provider<StringNormalizer> {
public Provider(Configurator configurator, Configuration config) throws ConfigurationException {
super(configurator, config);
}
@Override
public Class<StringNormalizer> getType() {
return StringNormalizer.class;
}
@Override
public String getPath() {
return "stringnormalizers";
}
@Override
public StringNormalizer get(String name, Config config, Map<String, String> runtimeParams) throws ConfigurationException {
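            // Not a lucene entry, so this provider does not apply.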
if (!config.getString("type").equals("lucene")) {
return null;
}
Version version = Version.parseLeniently(config.getString("version"));
TokenizerOptions opts = new TokenizerOptions(
config.getBoolean("caseInsensitive"),
config.getBoolean("useStopWords"),
config.getBoolean("useStem")
);
return new LuceneStringNormalizer(opts, version);
}
}
}