package org.wikibrain.phrases;
import com.typesafe.config.Config;
import org.wikibrain.conf.Configuration;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.LocalPageDao;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.lang.LanguageSet;
import org.wikibrain.core.lang.LocalId;
import org.wikibrain.core.model.LocalPage;
import org.wikibrain.lucene.*;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
* Using Lucene in a phrase analyzer.
* @author Yulun Li
public class LucenePhraseAnalyzer implements PhraseAnalyzer {
private static final Logger LOG = LoggerFactory.getLogger(PhraseAnalyzer.class);
* Analyzer retrieves this times as many results as needed and
* sums scores for all results. The effect of this variable is
* smoothing out and lowering probability scores in the returned results.
public static final int DOC_MULTIPLIER = 3;
private final LuceneSearcher searcher;
protected LocalPageDao localPageDao;
public LucenePhraseAnalyzer(LocalPageDao localPageDao, LuceneSearcher searcher) {
this.localPageDao = localPageDao;
this.searcher = searcher;
public LinkedHashMap<LocalId, Float> resolve(Language language, String phrase, int maxPages) throws DaoException {
LinkedHashMap<LocalId, Float> result = new LinkedHashMap<LocalId, Float>();
if (phrase == null || phrase.trim().equals("")) {
return result;
// query the title field for the phrase.
WikiBrainScoreDoc[] wikibrainScoreDocs = searcher.getQueryBuilderByLanguage(language)
.setPhraseQuery(new TextFieldElements().addTitle(), phrase)
.setNumHits(maxPages * DOC_MULTIPLIER)
if (wikibrainScoreDocs.length == 0) {
// If there is no result from title field query, query the plaintext field.
wikibrainScoreDocs = searcher.getQueryBuilderByLanguage(language)
.setPhraseQuery(new TextFieldElements().addPlainText(), phrase)
.setNumHits(maxPages * DOC_MULTIPLIER)
// When the phrase does not get an result from title or plaintext, this section
// creates a search query of substrings with at least 3 characters and concatenate
// them with space. This aims at phrases like "bioarchaeology" of which part
// of the phrase may get a result while the phrase as a whole may not.
if (wikibrainScoreDocs.length == 0 && !phrase.contains(" ") && phrase.length() > 3) {
String phraseMultiVersion = "";
for (int i = 1; i < phrase.length(); i++) {
phraseMultiVersion += (i > 2 ? phrase.substring(0, i) + " " : "");
phraseMultiVersion += (phrase.length() - i > 2 ? phrase.substring(i, phrase.length()) + " " : "");
wikibrainScoreDocs = searcher.getQueryBuilderByLanguage(language)
float totalScore = 0;
// The scores represent probability so they should sum to 1.
for (WikiBrainScoreDoc wikibrainScoreDoc : wikibrainScoreDocs) {
totalScore += wikibrainScoreDoc.score;
for (WikiBrainScoreDoc wikibrainScoreDoc : wikibrainScoreDocs) {
LocalId localId = new LocalId(language, wikibrainScoreDoc.wpId);
// if (result.isEmpty()) {
// System.out.println("top result for " + phrase + " is " + localPage.getTitle());
// }
result.put(localId, wikibrainScoreDoc.score / totalScore);
if (result.size() >= maxPages) {
return result;
* Use a Provider to get configuration in phrases.analyzer.
public static class Provider extends org.wikibrain.conf.Provider<PhraseAnalyzer> {
public Provider(Configurator configurator, Configuration config) throws ConfigurationException {
super(configurator, config);
public Class<PhraseAnalyzer> getType() {
return PhraseAnalyzer.class;
public String getPath() {
return "phrases.analyzer";
public PhraseAnalyzer get(String name, Config config, Map<String, String> runtimeParams) throws ConfigurationException {
if (!config.getString("type").equals("lucene")) {
return null;
LocalPageDao localPageDao = getConfigurator().get(LocalPageDao.class, config.getString("localPageDao"));
LanguageSet langs = getConfigurator().get(LanguageSet.class);
LuceneSearcher searcher = new LuceneSearcher(langs, getConfigurator().get(LuceneOptions.class));
return new LucenePhraseAnalyzer(localPageDao, searcher);
public LinkedHashMap<String, Float> describe(Language language, LocalPage page, int maxPhrases) throws DaoException {
throw new UnsupportedOperationException();
public int loadCorpus(LanguageSet langs) throws DaoException, IOException {
return -1;