package org.wikibrain.phrases;
import com.typesafe.config.Config;
import gnu.trove.map.TLongIntMap;
import gnu.trove.map.hash.TLongIntHashMap;
import org.apache.commons.collections.Predicate;
import org.apache.commons.collections.Transformer;
import org.apache.commons.collections.iterators.FilterIterator;
import org.apache.commons.collections.iterators.TransformIterator;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wikibrain.conf.Configuration;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.lang.LanguageSet;
import org.wikibrain.core.lang.StringNormalizer;
import org.wikibrain.core.model.RawPage;
import org.wikibrain.utils.*;
import java.io.File;
import java.io.IOException;
import java.util.*;
/**
* Persists information about phrases to page relationships using an object database.
*/
public class PhraseAnalyzerObjectDbDao implements PhraseAnalyzerDao {
    private static final Logger LOG = LoggerFactory.getLogger(PhraseAnalyzerObjectDbDao.class);

    /** Per-language daos opened so far. Guarded by this object's monitor. */
    private final Map<Language, PhraseAnalyzerLangDao> langDaos = new HashMap<Language, PhraseAnalyzerLangDao>();

    /** Root directory; each language gets a sub-directory named after its language code. */
    private final File dir;

    /** Whether this dao was opened in "new" mode (existing contents of {@link #dir} discarded). */
    private final boolean isNew;

    private final StringNormalizer normalizer;

    /**
     * Creates a new dao using the given directory.
     *
     * @param normalizer Normalizer handed to every per-language dao and returned by
     *                   {@link #getStringNormalizer()}.
     * @param path Root directory for the per-language sub-directories.
     * @param isNew If true, delete any information contained in the directory and recreate it.
     * @throws DaoException if {@code isNew} is true and the directory cannot be created.
     */
    public PhraseAnalyzerObjectDbDao(StringNormalizer normalizer, File path, boolean isNew) throws DaoException {
        this.dir = path;
        this.isNew = isNew;
        this.normalizer = normalizer;
        if (isNew) {
            if (path.exists()) {
                FileUtils.deleteQuietly(path);
            }
            // mkdirs() returns false both on failure and when the directory already
            // exists (e.g. the quiet delete above did not succeed), so only treat the
            // call as failed when there is still no directory afterwards.
            if (!path.mkdirs() && !path.isDirectory()) {
                throw new DaoException("Could not create phrase dao directory " + path);
            }
        }
    }

    /**
     * Returns the dao for a language, opening and caching it on first use.
     *
     * @param lang Language whose dao is requested.
     * @return The cached or newly opened dao, or {@code null} if this dao is not new and
     *         no sub-directory exists for the language.
     * @throws DaoException
     */
    synchronized PhraseAnalyzerLangDao getDao(Language lang) throws DaoException {
        PhraseAnalyzerLangDao dao = langDaos.get(lang);
        if (dao != null) {
            return dao;
        }
        File subDir = new File(dir, lang.getLangCode());
        if (subDir.isDirectory() || isNew) {
            dao = new PhraseAnalyzerLangDao(normalizer, lang, subDir, isNew);
            langDaos.put(lang, dao);
            return dao;
        }
        // Read paths treat a missing language as "no data", so report it with null
        // rather than an exception; write paths go through requireDao() instead.
        return null;
    }

    /**
     * Like {@link #getDao(Language)}, but fails with a descriptive exception instead of
     * returning {@code null}. Used by the write paths, which cannot proceed without a dao.
     */
    private PhraseAnalyzerLangDao requireDao(Language lang) throws DaoException {
        PhraseAnalyzerLangDao dao = getDao(lang);
        if (dao == null) {
            throw new DaoException("No phrase dao available for " + lang);
        }
        return dao;
    }

    /**
     * Returns a copy of the currently opened daos, taken under the same lock that
     * {@link #getDao(Language)} holds while modifying the map, so callers can iterate
     * without racing against a concurrent open.
     */
    private synchronized List<PhraseAnalyzerLangDao> openDaos() {
        return new ArrayList<PhraseAnalyzerLangDao>(langDaos.values());
    }

    /**
     * Saves the phrase counts associated with a page.
     *
     * @param lang
     * @param wpId Local page id
     * @param counts
     * @throws DaoException if no dao is available for the language or the save fails.
     */
    @Override
    public void savePageCounts(Language lang, int wpId, PrunedCounts<String> counts) throws DaoException {
        requireDao(lang).savePageCounts(wpId, counts);
    }

    /**
     * Saves the page counts associated with a phrase.
     *
     * @param lang
     * @param phrase
     * @param counts
     * @throws DaoException if no dao is available for the language or the save fails.
     */
    @Override
    public void savePhraseCounts(Language lang, String phrase, PrunedCounts<Integer> counts) throws DaoException {
        requireDao(lang).savePhraseCounts(phrase, counts);
    }

    /**
     * Iterates over all phrases stored for a language.
     *
     * @param lang
     * @return Iterator over the phrases; empty if no dao is available for the language.
     */
    @Override
    public Iterator<String> getAllPhrases(final Language lang) {
        try {
            PhraseAnalyzerLangDao dao = getDao(lang);
            if (dao == null) {
                return new ArrayList<String>().iterator();
            }
            return dao.getAllPhrases();
        } catch (DaoException e) {
            // The interface method declares no checked exception, so wrap it.
            throw new RuntimeException(e);
        }
    }

    /**
     * Iterates over all phrases stored for a language together with their counts.
     *
     * @param lang
     * @return Iterator over (phrase, counts) pairs; empty if no dao is available for the language.
     */
    @Override
    public Iterator<Pair<String, PrunedCounts<Integer>>> getAllPhraseCounts(final Language lang) {
        try {
            PhraseAnalyzerLangDao dao = getDao(lang);
            if (dao == null) {
                return new ArrayList<Pair<String, PrunedCounts<Integer>>>().iterator();
            }
            return dao.getAllPhraseCounts();
        } catch (DaoException e) {
            // The interface method declares no checked exception, so wrap it.
            throw new RuntimeException(e);
        }
    }

    @Override
    public StringNormalizer getStringNormalizer() {
        return normalizer;
    }

    /**
     * Gets pages related to a phrase.
     *
     * @param lang
     * @param phrase
     * @param maxPages
     * @return Map from page ids (in the local language) to the number of occurrences
     * ordered by decreasing count, or null if no dao is available for the language.
     * @throws DaoException
     */
    @Override
    public PrunedCounts<Integer> getPhraseCounts(Language lang, String phrase, int maxPages) throws DaoException {
        PhraseAnalyzerLangDao dao = getDao(lang);
        return (dao == null) ? null : dao.getPhraseCounts(phrase, maxPages);
    }

    /**
     * Gets phrases related to a page.
     *
     * @param lang
     * @param wpId Local page id
     * @param maxPhrases
     * @return Map from phrases (in the local language) to the number of occurrences
     * ordered by decreasing count, or null if no dao is available for the language.
     * @throws DaoException
     */
    @Override
    public PrunedCounts<String> getPageCounts(Language lang, int wpId, int maxPhrases) throws DaoException {
        PhraseAnalyzerLangDao dao = getDao(lang);
        return (dao == null) ? null : dao.getPageCounts(wpId, maxPhrases);
    }

    /** Flushes every per-language dao opened so far. */
    @Override
    public void flush() {
        for (PhraseAnalyzerLangDao dao : openDaos()) {
            dao.flush();
        }
    }

    /** Closes every per-language dao opened so far. */
    @Override
    public void close() {
        for (PhraseAnalyzerLangDao dao : openDaos()) {
            dao.close();
        }
    }

    /**
     * Configuration provider that builds a {@link PhraseAnalyzerObjectDbDao} for
     * "phrases.dao" entries whose type is "objectdb".
     */
    public static class Provider extends org.wikibrain.conf.Provider<PhraseAnalyzerDao> {
        public Provider(Configurator configurator, Configuration config) throws ConfigurationException {
            super(configurator, config);
        }

        @Override
        public Class<PhraseAnalyzerDao> getType() {
            return PhraseAnalyzerDao.class;
        }

        @Override
        public String getPath() {
            return "phrases.dao";
        }

        /**
         * Builds the dao described by a configuration entry.
         *
         * @param name Entry name; also used as the directory name under "phrases.path".
         * @param config Entry configuration; reads "type", "isNew" and "normalizer".
         * @param runtimeParams Not used by this provider.
         * @return The dao, or null if the entry's type is not "objectdb".
         * @throws ConfigurationException if the dao cannot be constructed.
         */
        @Override
        public PhraseAnalyzerDao get(String name, Config config, Map<String, String> runtimeParams) throws ConfigurationException {
            if (!config.getString("type").equals("objectdb")) {
                return null;
            }
            boolean isNew = config.getBoolean("isNew");
            File path = new File(getConfig().get().getString("phrases.path"), name);
            StringNormalizer normalizer = getConfigurator().get(StringNormalizer.class, config.getString("normalizer"));
            try {
                return new PhraseAnalyzerObjectDbDao(normalizer, path, isNew);
            } catch (DaoException e) {
                throw new ConfigurationException(e);
            }
        }
    }
}