package org.wikibrain.phrases;
import com.typesafe.config.Config;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wikibrain.conf.Configuration;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.DaoFilter;
import org.wikibrain.core.dao.LocalLinkDao;
import org.wikibrain.core.dao.LocalPageDao;
import org.wikibrain.core.lang.LanguageSet;
import org.wikibrain.core.lang.StringNormalizer;
import org.wikibrain.core.model.LocalLink;
/**
 * Phrase analyzer whose corpus is derived from the anchor text of wiki links.
 *
 * <p>Every {@link LocalLink} supplied by the {@link LocalLinkDao} is turned into one
 * {@link BasePhraseAnalyzer.Entry} built from the link's language, destination id and
 * anchor text.
 */
public class AnchorTextPhraseAnalyzer extends BasePhraseAnalyzer {
    private static final Logger LOG = LoggerFactory.getLogger(AnchorTextPhraseAnalyzer.class);

    /** Source of the links whose anchor text is turned into corpus entries. */
    private final LocalLinkDao linkDao;

    /**
     * Creates an analyzer that reads its corpus from {@code linkDao}.
     *
     * @param phraseDao    forwarded unchanged to the {@link BasePhraseAnalyzer} constructor
     * @param pageDao      forwarded unchanged to the {@link BasePhraseAnalyzer} constructor
     * @param linkDao      DAO queried for links each time the corpus is iterated
     * @param phrasePruner forwarded unchanged to the {@link BasePhraseAnalyzer} constructor
     * @param pagePruner   forwarded unchanged to the {@link BasePhraseAnalyzer} constructor
     */
    public AnchorTextPhraseAnalyzer(PhraseAnalyzerDao phraseDao, LocalPageDao pageDao, LocalLinkDao linkDao, PrunedCounts.Pruner<String> phrasePruner, PrunedCounts.Pruner<Integer> pagePruner) {
        super(phraseDao, pageDao, phrasePruner, pagePruner);
        this.linkDao = linkDao;
    }

    /**
     * Returns the corpus of phrase entries obtained from link anchor text.
     *
     * <p>The returned iterable is lazy: the link DAO is only queried when
     * {@code iterator()} is called, and it is queried again on every such call.
     * Because {@link Iterable#iterator()} cannot throw checked exceptions, a
     * {@link DaoException} raised by that query is rethrown wrapped in a
     * {@link RuntimeException} (with the original exception as its cause).
     *
     * @param langs languages used to filter the links fetched from the link DAO
     * @return an iterable producing one entry per link returned by the DAO
     * @throws IOException  declared by the overridden method; not thrown by this implementation
     * @throws DaoException declared by the overridden method; not thrown directly by this
     *                      implementation (see the note on wrapping above)
     */
    @Override
    public Iterable<BasePhraseAnalyzer.Entry> getCorpus(final LanguageSet langs) throws IOException, DaoException {
        return new Iterable<BasePhraseAnalyzer.Entry>() {
            @Override
            public Iterator<BasePhraseAnalyzer.Entry> iterator() {
                try {
                    return new Iter(linkDao.get(new DaoFilter().setLanguages(langs)).iterator());
                } catch (DaoException e) {
                    throw new RuntimeException(e);
                }
            }
        };
    }

    /**
     * Adapts an iterator over {@link LocalLink}s into an iterator over corpus entries,
     * using a one-element look-ahead buffer.
     */
    public class Iter implements Iterator<BasePhraseAnalyzer.Entry> {
        /** Underlying link iterator being adapted. */
        Iterator<LocalLink> iter;

        /** Next entry to hand out, or {@code null} if none has been prefetched. */
        private BasePhraseAnalyzer.Entry buffer = null;

        /** Set once the underlying iterator is exhausted or has produced a null link. */
        private boolean finished = false;

        Iter(Iterator<LocalLink> iter) {
            this.iter = iter;
        }

        /**
         * Reports whether another entry is available, prefetching it if necessary.
         */
        @Override
        public boolean hasNext() {
            if (buffer != null) {
                return true;
            }
            if (finished) {
                return false;
            }
            fillBuffer();
            return buffer != null;
        }

        /**
         * Returns the next entry and clears the look-ahead buffer.
         *
         * @throws NoSuchElementException if no further entries are available, as required
         *                                by the {@link Iterator} contract
         */
        @Override
        public BasePhraseAnalyzer.Entry next() {
            // hasNext() prefetches into the buffer; failing here instead of returning null
            // keeps callers from silently receiving a null entry after exhaustion.
            if (!hasNext()) {
                throw new NoSuchElementException();
            }
            BasePhraseAnalyzer.Entry tmp = buffer;
            buffer = null;
            return tmp;
        }

        /**
         * Removal is not supported.
         *
         * @throws UnsupportedOperationException always
         */
        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }

        /**
         * Prefetches the next entry into {@code buffer} unless one is already buffered
         * or iteration has finished.
         */
        private void fillBuffer() {
            if (finished || buffer != null) {
                return;
            }
            if (!iter.hasNext()) {
                finished = true;
                return;
            }
            LocalLink ll = iter.next();
            // A null link is treated as end of stream: iteration stops here even if the
            // underlying iterator could still produce more elements.
            if (ll == null) {
                finished = true;
                return;
            }
            // NOTE(review): the trailing 1 is presumably an occurrence count of one per
            // link; confirm against BasePhraseAnalyzer.Entry.
            buffer = new BasePhraseAnalyzer.Entry(
                    ll.getLanguage(), ll.getDestId(), ll.getAnchorText(), 1
            );
        }
    }

    /**
     * Configuration provider that builds an {@link AnchorTextPhraseAnalyzer} for
     * configurations whose {@code type} is {@code "anchortext"}.
     */
    public static class Provider extends org.wikibrain.conf.Provider<PhraseAnalyzer> {
        public Provider(Configurator configurator, Configuration config) throws ConfigurationException {
            super(configurator, config);
        }

        @Override
        public Class getType() {
            return PhraseAnalyzer.class;
        }

        @Override
        public String getPath() {
            return "phrases.analyzer";
        }

        /**
         * Builds the analyzer described by {@code config}.
         *
         * <p>Reads the keys {@code type}, {@code dao}, {@code localPageDao},
         * {@code localLinkDao}, {@code phrasePruner} and {@code pagePruner}.
         *
         * @param name          name passed on when constructing the phrase analyzer DAO
         * @param config        configuration for this analyzer
         * @param runtimeParams not used by this provider
         * @return a new analyzer, or {@code null} if {@code type} is not {@code "anchortext"}
         * @throws ConfigurationException if a dependency cannot be obtained from the configurator
         */
        @Override
        public PhraseAnalyzer get(String name, Config config, Map<String, String> runtimeParams) throws ConfigurationException {
            if (!config.getString("type").equals("anchortext")) {
                return null;
            }
            PhraseAnalyzerDao paDao = getConfigurator().construct(
                    PhraseAnalyzerDao.class, name, config.getConfig("dao"),
                    new HashMap<String, String>());
            LocalPageDao lpDao = getConfigurator().get(LocalPageDao.class, config.getString("localPageDao"));
            LocalLinkDao llDao = getConfigurator().get(LocalLinkDao.class, config.getString("localLinkDao"));
            PrunedCounts.Pruner<String> phrasePruner = getConfigurator().construct(
                    PrunedCounts.Pruner.class, null, config.getConfig("phrasePruner"), null);
            PrunedCounts.Pruner<Integer> pagePruner = getConfigurator().construct(
                    PrunedCounts.Pruner.class, null, config.getConfig("pagePruner"), null);
            return new AnchorTextPhraseAnalyzer(paDao, lpDao, llDao, phrasePruner, pagePruner);
        }
    }
}