package org.wikibrain.phrases;
import com.typesafe.config.Config;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.Fraction;
import org.wikibrain.conf.Configuration;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.LocalPageDao;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.lang.LanguageSet;
import org.wikibrain.download.FileDownloader;
import org.wikibrain.utils.WpIOUtils;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Loads phrase-to-page mappings from the index (dictionary) files distributed at
* http://www-nlp.stanford.edu/pubs/crosswikis-data.tar.bz2/
* into a PhraseAnalyzer.
*
* These files capture the anchor phrases associated with web pages that link to Wikipedia.
* Note that the pages containing the anchor phrases are not (usually) Wikipedia pages themselves.
*/
public class StanfordPhraseAnalyzer extends BasePhraseAnalyzer {
// Logger shared by the analyzer, its iterator, and the static download helper.
private static final Logger LOG = LoggerFactory.getLogger(StanfordPhraseAnalyzer.class);
// The two languages that getCorpus() accepts without warning and that Iter emits entries for.
private static final Language LANG_EN = Language.getByLangCode("en");
private static final Language LANG_SIMPLE = Language.getByLangCode("simple");
// Dictionary file that every Iter opens in its constructor.
private final File path;
// Languages passed to the most recent getCorpus() call; Iter reads this field while parsing lines.
private LanguageSet languages;
/**
* Creates an analyzer whose corpus is read from the given dictionary file.
*
* @param phraseDao passed through unchanged to the BasePhraseAnalyzer constructor
* @param pageDao passed through unchanged to the BasePhraseAnalyzer constructor
* @param phrasePruner passed through unchanged to the BasePhraseAnalyzer constructor
* @param pagePruner passed through unchanged to the BasePhraseAnalyzer constructor
* @param path dictionary file; it is not opened here, only when a corpus iterator is created
*/
public StanfordPhraseAnalyzer(PhraseAnalyzerDao phraseDao, LocalPageDao pageDao, PrunedCounts.Pruner<String> phrasePruner, PrunedCounts.Pruner<Integer> pagePruner, File path) {
super(phraseDao, pageDao, phrasePruner, pagePruner);
this.path = path;
}
/**
 * Returns the corpus of phrase entries backed by the Stanford dictionary file.
 *
 * Nothing is read here: the file is opened each time {@code iterator()} is called on
 * the returned object, and lines are parsed lazily as the iterator advances. A warning
 * is logged for every requested language other than English and Simple English.
 *
 * @param langs the languages entries should be produced for; remembered on this
 *              analyzer so the iterator can consult it while parsing
 * @return an iterable whose iterators each re-read the dictionary file from the start
 * @throws IOException declared for the overridden signature; a failure to open the file
 *                     surfaces from {@code iterator()} wrapped in a RuntimeException
 * @throws DaoException declared for the overridden signature
 */
@Override
protected Iterable<BasePhraseAnalyzer.Entry> getCorpus(LanguageSet langs) throws IOException, DaoException {
    // Report each requested language this corpus cannot serve.
    for (Language requested : langs) {
        boolean supported = (requested == LANG_EN) || (requested == LANG_SIMPLE);
        if (!supported) {
            LOG.warn("Stanford only supports English and Simple English (not " + requested + ")");
        }
    }

    // Iter reads this field to decide which languages each parsed line is emitted for.
    this.languages = langs;

    Iterable<Entry> corpus = new Iterable<Entry>() {
        @Override
        public Iterator<Entry> iterator() {
            Iterator<Entry> lines;
            try {
                lines = new Iter();
            } catch (IOException e) {
                // Iterable.iterator() cannot throw checked exceptions.
                throw new RuntimeException(e);
            }
            return lines;
        }
    };
    return corpus;
}
/**
 * Iterator over the dictionary file that parses one line at a time.
 *
 * Each successfully parsed line yields one entry for English and/or Simple English,
 * depending on which of the two the enclosing analyzer's current language set contains.
 * Lines that fail to parse are logged at debug level and skipped. The underlying reader
 * is closed when the end of the file is reached or when reading fails.
 */
protected class Iter implements Iterator<BasePhraseAnalyzer.Entry> {
    BufferedReader reader;
    // Entries produced by the most recently parsed line that have not been handed out yet.
    List<Entry> buffer = new ArrayList<Entry>();
    // Set once the reader has been exhausted (or has failed) and closed.
    boolean eof = false;

    /**
     * Opens the enclosing analyzer's dictionary file.
     *
     * @throws IOException if the file cannot be opened
     */
    public Iter() throws IOException {
        reader = WpIOUtils.openBufferedReader(path);
    }

    @Override
    public boolean hasNext() {
        fillBuffer();
        return !buffer.isEmpty();
    }

    /**
     * @return the next entry
     * @throws NoSuchElementException if the file has no further entries
     */
    @Override
    public BasePhraseAnalyzer.Entry next() {
        fillBuffer();
        if (buffer.isEmpty()) {
            // The Iterator contract requires an exception here rather than a null element.
            throw new NoSuchElementException("no more entries in " + path);
        }
        return buffer.remove(0);
    }

    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * Reads lines until at least one entry is buffered or the file is exhausted.
     * Unparseable lines are skipped; an I/O failure ends the iteration.
     */
    private void fillBuffer() {
        while (!eof && buffer.isEmpty()) {
            try {
                parseNextLine();
            } catch (IOException e) {
                // Release the file handle and stop iterating; the reader is unusable now.
                eof = true;
                IOUtils.closeQuietly(reader);
                throw new RuntimeException(e);
            } catch (Exception e) {
                // Best effort: a malformed line should not abort the whole load.
                LOG.debug("Error parsing line:", e);
            }
        }
    }

    /**
     * Reads one line and buffers the entries it produces; on end of file closes the
     * reader and marks the iterator as exhausted.
     *
     * @throws IOException if reading the line fails
     * @throws IllegalArgumentException if the line does not match the dictionary format
     */
    private void parseNextLine() throws IOException {
        if (!buffer.isEmpty()) {
            throw new IllegalStateException();
        }
        String line = reader.readLine();
        if (line == null) {
            IOUtils.closeQuietly(reader);
            eof = true;
            return;
        }
        Record r = new Record(line);
        // The count does not depend on the language, so compute it once per line.
        int numLinks = r.getNumEnglishLinks();
        for (Language l : Arrays.asList(LANG_EN, LANG_SIMPLE)) {
            if (languages.containsLanguage(l)) {
                buffer.add(new BasePhraseAnalyzer.Entry(l, r.article, r.phrase, numLinks));
            }
        }
    }
}
/**
 * Format of one dictionary line: phrase, TAB, a numeric fraction, SPACE, article,
 * optionally followed by a space and the remaining flag text.
 */
private static final Pattern MATCH_ENTRY = Pattern.compile("([^\t]*)\t([0-9.e-]+) ([^ ]*)(| (.*))$");

/**
 * A single entry corresponding to a line from a
 * dictionary.bz2 at http://www-nlp.stanford.edu/pubs/crosswikis-data.tar.bz2/.
 *
 * Major components of an entry are:
 * - textual phrase
 * - concept (a wikipedia article)
 * - A variety of flags
 */
class Record {
    String phrase;
    float fraction;
    String article;
    // Space-separated flags from the end of the line; empty when the line has none.
    String[] flags;

    /**
     * Parses one dictionary line.
     *
     * @param line the raw line, without its line terminator
     * @throws IllegalArgumentException if the line does not match the expected format
     *         (this includes a NumberFormatException for an unparseable fraction)
     */
    Record(String line) {
        Matcher m = MATCH_ENTRY.matcher(line);
        if (!m.matches()) {
            throw new IllegalArgumentException("invalid concepts entry: '" + line + "'");
        }
        this.phrase = m.group(1);
        this.fraction = Float.parseFloat(m.group(2));
        this.article = m.group(3);
        // Group 4 is "" when no flags follow the article; splitting "" would yield a
        // single empty flag, so map that case to an empty array instead.
        String rawFlags = m.group(4).trim();
        this.flags = rawFlags.isEmpty() ? new String[0] : rawFlags.split(" ");
    }

    /**
     * Returns the numerator of the first flag that starts with "W:", with the rest of
     * the flag parsed as a fraction, or 0 when no such flag is present.
     *
     * @throws RuntimeException if the flag's value cannot be parsed as a fraction
     */
    int getNumEnglishLinks() {
        for (String flag : flags) {
            if (flag.startsWith("W:")) {
                return Fraction.getFraction(flag.substring(2)).getNumerator();
            }
        }
        return 0;
    }
}
/**
 * Downloads the Stanford dictionary unless a previous download is marked complete.
 *
 * The target location and source URL are read from the configuration keys
 * {@code phrases.analyzer.stanford.path} and {@code phrases.analyzer.stanford.url}.
 * Completion is tracked by a marker file named after the target path with
 * {@code .completed} appended; the marker is touched only after the download returns.
 *
 * @param conf configuration supplying the path and URL
 * @throws IOException if the download or the creation of the marker file fails
 * @throws InterruptedException declared for the downloader call
 */
public static void downloadDictionaryIfNecessary(Configuration conf) throws IOException, InterruptedException {
    String dictionaryPath = conf.get().getString("phrases.analyzer.stanford.path");
    String dictionaryUrl = conf.get().getString("phrases.analyzer.stanford.url");

    File marker = new File(dictionaryPath + ".completed");
    if (marker.isFile()) {
        // An earlier run already finished the download.
        return;
    }

    LOG.info("downloading stanford dictionary...");
    File target = new File(dictionaryPath);
    new FileDownloader().download(new URL(dictionaryUrl), target);
    FileUtils.touch(marker);
}
/**
 * Configuration provider that builds a {@link StanfordPhraseAnalyzer} for phrase
 * analyzer configurations whose {@code type} is {@code "stanford"}.
 */
public static class Provider extends org.wikibrain.conf.Provider<PhraseAnalyzer> {
    public Provider(Configurator configurator, Configuration config) throws ConfigurationException {
        super(configurator, config);
    }

    @Override
    public Class getType() {
        return PhraseAnalyzer.class;
    }

    @Override
    public String getPath() {
        return "phrases.analyzer";
    }

    /**
     * Builds the analyzer from its configuration block.
     *
     * @param name name of the configured component; also used when constructing the DAO
     * @param config block providing {@code type}, {@code dao}, {@code localPageDao},
     *               {@code path}, {@code phrasePruner} and {@code pagePruner}
     * @param runtimeParams not used by this provider
     * @return the analyzer, or {@code null} when {@code type} is not {@code "stanford"}
     */
    @Override
    public PhraseAnalyzer get(String name, Config config, Map<String, String> runtimeParams) throws ConfigurationException {
        String type = config.getString("type");
        if (!"stanford".equals(type)) {
            // Not ours; returning null signals that this provider does not apply.
            return null;
        }

        Configurator configurator = getConfigurator();

        Map<String, String> daoParams = new HashMap<String, String>();
        PhraseAnalyzerDao phraseDao = configurator.construct(
                PhraseAnalyzerDao.class, name, config.getConfig("dao"), daoParams);

        String pageDaoName = config.getString("localPageDao");
        LocalPageDao pageDao = configurator.get(LocalPageDao.class, pageDaoName);

        File dictionary = new File(config.getString("path"));

        PrunedCounts.Pruner<String> prunerForPhrases = configurator.construct(
                PrunedCounts.Pruner.class, null, config.getConfig("phrasePruner"), null);
        PrunedCounts.Pruner<Integer> prunerForPages = configurator.construct(
                PrunedCounts.Pruner.class, null, config.getConfig("pagePruner"), null);

        return new StanfordPhraseAnalyzer(phraseDao, pageDao, prunerForPhrases, prunerForPages, dictionary);
    }
}
}