package org.wikibrain.wikidata;
import com.google.gson.Gson;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import com.typesafe.config.Config;
import gnu.trove.set.TIntSet;
import gnu.trove.set.hash.TIntHashSet;
import org.apache.commons.collections.IteratorUtils;
import org.jooq.*;
import org.wikibrain.conf.Configuration;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.DaoFilter;
import org.wikibrain.core.dao.LocalPageDao;
import org.wikibrain.core.dao.UniversalPageDao;
import org.wikibrain.core.dao.sql.AbstractSqlDao;
import org.wikibrain.core.dao.sql.FastLoader;
import org.wikibrain.core.dao.sql.SimpleSqlDaoIterable;
import org.wikibrain.core.dao.sql.WpDataSource;
import org.wikibrain.core.jooq.Tables;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.lang.LocalId;
import org.wikibrain.core.model.LocalPage;
import org.wikibrain.core.model.UniversalPage;
import org.wikibrain.parser.WpParseException;
import java.io.File;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.wikibrain.core.jooq.tables.WikidataEntityAliases.*;
import static org.wikibrain.core.jooq.tables.WikidataEntityLabels.*;
import static org.wikibrain.core.jooq.tables.WikidataEntityDescriptions.*;
import static org.wikibrain.core.jooq.tables.WikidataStatement.*;
/**
* @author Shilad Sen
*/
public class WikidataSqlDao extends AbstractSqlDao<WikidataStatement> implements WikidataDao {
/** Language whose label is tried when an entity has no label in the requested language. */
private static final Language FALLBACK_LANGUAGE = Language.getByLangCode("en");

/**
 * Columns of the statement table handed to the superclass constructor.
 * {@link #save(WikidataStatement)} supplies its values in this same order.
 */
private static final TableField[] FIELDS = new TableField[] {
    WIKIDATA_STATEMENT.ID,
    WIKIDATA_STATEMENT.ENTITY_TYPE,
    WIKIDATA_STATEMENT.ENTITY_ID,
    WIKIDATA_STATEMENT.PROP_ID,
    WIKIDATA_STATEMENT.VAL_TYPE,
    WIKIDATA_STATEMENT.VAL_STR,
    WIKIDATA_STATEMENT.RANK
};

private final LocalPageDao lpDao;

/** Maps local pages to universal page ids, which this dao uses as Wikidata item ids. */
private final UniversalPageDao upDao;

/** Serializes statement values to the JSON text stored in VAL_STR. */
private final Gson gson = new Gson();

// Bulk loaders for the label / description / alias tables.
// They are created in beginLoad() and reset to null in endLoad().
private FastLoader labelLoader = null;
private FastLoader descLoader = null;
private FastLoader aliasLoader = null;

/**
 * Properties keyed by property id. Filled lazily by getProperties(),
 * replaced by an empty map in beginLoad(), and appended to by save(WikidataEntity).
 */
private Map<Integer, WikidataEntity> properties;

/** Converts the stored JSON back into WikidataValue objects when statements are read. */
private final WikidataParser parser = new WikidataParser();
/**
 * Creates a SQL-backed Wikidata dao on top of the statement table.
 *
 * @param dataSource Data source for jdbc connections
 * @param lpDao      local page dao kept by this instance
 * @param upDao      universal page dao used to resolve pages to item ids
 * @throws org.wikibrain.core.dao.DaoException if the superclass constructor fails
 */
public WikidataSqlDao(WpDataSource dataSource, LocalPageDao lpDao, UniversalPageDao upDao) throws DaoException {
    super(dataSource, FIELDS, "/db/wikidata");
    this.upDao = upDao;
    this.lpDao = lpDao;
}
/**
 * Finds the first property whose label in {@code language} equals {@code name},
 * ignoring case.
 *
 * Case folding uses {@link Locale#ROOT} so the comparison does not depend on the
 * JVM's default locale (e.g. under a Turkish default locale "ID" would otherwise
 * not match "id").
 *
 * @param language language of the label to compare against
 * @param name     property label to look for; must not be null
 * @return the matching property, or null if no property has that label
 * @throws DaoException if the properties cannot be loaded
 */
@Override
public WikidataEntity getProperty(Language language, String name) throws DaoException {
    String wanted = name.toLowerCase(Locale.ROOT);
    for (WikidataEntity entity : getProperties().values()) {
        String label = entity.getLabels().get(language);
        if (label != null && label.toLowerCase(Locale.ROOT).equals(wanted)) {
            return entity;
        }
    }
    return null;
}
/**
 * Looks up a property by id.
 *
 * The in-memory property map is consulted first; when that map is missing or
 * empty the property is read straight from the database instead.
 *
 * @param id numeric property id
 * @return the property, or null when the map is populated but has no entry for id
 * @throws DaoException if loading fails
 */
@Override
public WikidataEntity getProperty(int id) throws DaoException {
    Map<Integer, WikidataEntity> known = getProperties(); // should be cached!
    boolean mapUsable = (known != null) && (known.size() != 0);
    if (mapUsable) {
        return known.get(id);
    }
    return getEntityWithoutCache(WikidataEntity.Type.PROPERTY, id);
}
/**
 * Loads an item, with its labels, descriptions, aliases and statements,
 * directly from the database.
 *
 * @param id numeric item id
 * @return the freshly built item entity
 * @throws DaoException if a query fails
 */
@Override
public WikidataEntity getItem(int id) throws DaoException {
    WikidataEntity.Type itemType = WikidataEntity.Type.ITEM;
    return getEntityWithoutCache(itemType, id);
}
/**
 * Returns all known properties keyed by property id, loading them on first use.
 *
 * Lookup order: the in-memory field, then the object cache entry
 * "wikidata-properties" (when a cache is configured), then a rebuild from the
 * label table, after which the result is written back to the cache.
 *
 * @return map from property id to property entity
 * @throws DaoException if a query fails while rebuilding
 */
@Override
public synchronized Map<Integer, WikidataEntity> getProperties() throws DaoException {
// Fast path: already loaded (or reset to an empty map by beginLoad()).
if (properties != null) {
return properties;
}
if (cache != null) {
// Unchecked cast: the cache API hands back an untyped object here.
properties = (Map<Integer, WikidataEntity>) cache.get("wikidata-properties", WikidataEntity.class);
}
if (properties == null || properties.size() == 0) {
// NOTE: the field must be assigned BEFORE the fill loop below.
// getEntityWithoutCache() iterates statements, building each one calls
// buildStatement() -> getProperty(int) -> getProperties() on this same thread.
// That nested call returns the partially filled map through the fast path
// above; if the field were still null it would start another rebuild.
properties = new ConcurrentHashMap<Integer, WikidataEntity>();
LOG.info("creating wikidata properties cache. This only happens once...");
DSLContext context = getJooq();
try {
// Ids of every entity of type PROPERTY that has at least one label row.
Result<Record1<Integer>> result = context
.select(Tables.WIKIDATA_ENTITY_LABELS.ENTITY_ID)
.from(Tables.WIKIDATA_ENTITY_LABELS)
.where(Tables.WIKIDATA_ENTITY_LABELS.ENTITY_TYPE.eq("" + WikidataEntity.Type.PROPERTY.code))
.fetch();
// A property has one label row per language, so collapse duplicates first.
TIntSet propIds = new TIntHashSet();
for (Record1<Integer> record : result) {
propIds.add(record.value1());
}
// One full entity load per distinct property id.
for (int id : propIds.toArray()) {
properties.put(id, getEntityWithoutCache(WikidataEntity.Type.PROPERTY, id));
}
} finally {
freeJooq(context);
}
if (cache != null) {
cache.put("wikidata-properties", properties);
}
}
LOG.info("loaded properties with size " + ((properties == null) ? 0 : properties.size()));
return properties;
}
/**
 * Resolves a local page to its Wikidata item id by asking the universal page dao.
 * Other methods in this class treat a negative id as "no item".
 *
 * @param page local page to resolve
 * @return the universal page id reported for the page
 * @throws DaoException if the lookup fails
 */
@Override
public Integer getItemId(LocalPage page) throws DaoException{
    Integer univId = upDao.getUnivPageId(page);
    return univId;
}
/**
 * Resolves a (language, local page id) pair to its Wikidata item id by asking
 * the universal page dao.
 *
 * @param localId language and local page id to resolve
 * @return the universal page id reported for that local id
 * @throws DaoException if the lookup fails
 */
@Override
public Integer getItemId(LocalId localId) throws DaoException {
    Integer univId = upDao.getUnivPageId(localId.getLanguage(), localId.getId());
    return univId;
}
/**
 * Fetches the universal page whose id equals the given item id.
 *
 * @param itemId Wikidata item id
 * @return whatever the universal page dao returns for that id
 * @throws DaoException if the lookup fails
 */
@Override
public UniversalPage getUniversalPage(int itemId) throws DaoException {
    return upDao.getById(itemId);
}
/**
 * Returns every statement whose subject is the item associated with a local page.
 *
 * Rows that could not be turned into a statement are skipped: the iterable
 * returned by {@link #get(WikidataFilter)} yields null for them (after logging
 * a warning), and those nulls are not passed on to the caller.
 *
 * @param page local page whose item statements are wanted
 * @return the statements, or an empty list when the page has no item (negative id)
 * @throws DaoException if the lookup or query fails
 */
@Override
public List<WikidataStatement> getStatements(LocalPage page) throws DaoException {
    List<WikidataStatement> statements = new ArrayList<WikidataStatement>();
    int conceptId = upDao.getUnivPageId(page);
    if (conceptId < 0) {
        return statements;
    }
    WikidataFilter filter = new WikidataFilter.Builder()
            .withEntityType(WikidataEntity.Type.ITEM)
            .withEntityId(conceptId)
            .build();
    for (WikidataStatement st : get(filter)) {
        if (st != null) {
            statements.add(st);
        }
    }
    return statements;
}
/**
 * Collects the properties that match a name in one language.
 *
 * A property matches when its alias list for the language contains the name
 * exactly, or otherwise when its label for the language contains the name
 * (labels are plain strings, so this second test is a substring test).
 *
 * @param language language whose aliases and label are inspected
 * @param name     name to search for
 * @return matching properties, each at most once; possibly empty
 * @throws DaoException if the properties cannot be loaded
 */
public Collection<WikidataEntity> getPropertyByName(Language language, String name) throws DaoException {
    List<WikidataEntity> found = new ArrayList<WikidataEntity>();
    for (WikidataEntity candidate : getProperties().values()) {
        boolean aliasHit = candidate.getAliases().containsKey(language)
                && candidate.getAliases().get(language).contains(name);
        // The label is only consulted when no alias matched.
        boolean labelHit = !aliasHit
                && candidate.getLabels().containsKey(language)
                && candidate.getLabels().get(language).contains(name);
        if (aliasHit || labelHit) {
            found.add(candidate);
        }
    }
    return found;
}
/**
 * Collects the properties that have {@code name} as a label or alias in any language.
 *
 * Aliases are stored as one list per language, so each list has to be searched;
 * asking the collection of lists whether it contains a String can never match.
 *
 * @param name exact label or alias to search for
 * @return matching properties without duplicates; possibly empty
 * @throws DaoException if the properties cannot be loaded
 */
public Collection<WikidataEntity> getPropertyByName(String name) throws DaoException {
    Set<WikidataEntity> matches = new HashSet<WikidataEntity>();
    for (WikidataEntity e : getProperties().values()) {
        if (e.getLabels().values().contains(name)) {
            matches.add(e);
            continue;
        }
        for (List<String> aliasesForLang : e.getAliases().values()) {
            if (aliasesForLang != null && aliasesForLang.contains(name)) {
                matches.add(e);
                break;
            }
        }
    }
    return matches;
}
/**
 * Returns the human-readable statements about the item behind a local page,
 * grouped by property label.
 *
 * @param page local page; its language (with "simple" mapped to "en") picks the labels
 * @return statements grouped by property label, or an empty map when the page has no
 *         item (negative id)
 * @throws DaoException if a lookup fails
 */
@Override
public Map<String, List<LocalWikidataStatement>> getLocalStatements(LocalPage page) throws DaoException {
    int itemId = getItemId(page);
    if (itemId >= 0) {
        Language labelLang = getRealLang(page.getLanguage());
        return getLocalStatements(labelLang, WikidataEntity.Type.ITEM, itemId);
    }
    return new HashMap<String, List<LocalWikidataStatement>>();
}
/**
 * Maps the language code "simple" onto "en" and leaves every other language untouched.
 *
 * @param lang requested language
 * @return the "en" language for "simple", otherwise lang itself
 */
private Language getRealLang(Language lang) {
    boolean isSimple = lang.getLangCode().equals("simple");
    return isSimple ? Language.getByLangCode("en") : lang;
}
/**
 * Builds an entity straight from the database: labels, descriptions, aliases
 * and finally its statements. The in-memory property map is not consulted for
 * the entity itself.
 *
 * @param type entity type (item or property)
 * @param id   numeric entity id
 * @return a newly created entity holding whatever rows were found
 * @throws DaoException if a query fails
 */
private WikidataEntity getEntityWithoutCache(WikidataEntity.Type type, int id) throws DaoException {
    WikidataEntity entity = new WikidataEntity(type, id);
    DSLContext ctx = getJooq();
    try {
        // The entity type is stored as the string form of its code.
        String typeCode = "" + type.code;

        // Labels: one row per language.
        Result<Record2<Short, String>> labelRows = ctx
                .select(Tables.WIKIDATA_ENTITY_LABELS.LANG_ID, Tables.WIKIDATA_ENTITY_LABELS.LABEL)
                .from(Tables.WIKIDATA_ENTITY_LABELS)
                .where(Tables.WIKIDATA_ENTITY_LABELS.ENTITY_TYPE.eq(typeCode))
                .and(Tables.WIKIDATA_ENTITY_LABELS.ENTITY_ID.eq(id))
                .fetch();
        for (Record2<Short, String> row : labelRows) {
            Language lang = Language.getById(row.value1());
            entity.getLabels().put(lang, row.value2());
        }

        // Descriptions: one row per language.
        Result<Record2<Short, String>> descRows = ctx
                .select(Tables.WIKIDATA_ENTITY_DESCRIPTIONS.LANG_ID, Tables.WIKIDATA_ENTITY_DESCRIPTIONS.DESCRIPTION)
                .from(Tables.WIKIDATA_ENTITY_DESCRIPTIONS)
                .where(Tables.WIKIDATA_ENTITY_DESCRIPTIONS.ENTITY_TYPE.eq(typeCode))
                .and(Tables.WIKIDATA_ENTITY_DESCRIPTIONS.ENTITY_ID.eq(id))
                .fetch();
        for (Record2<Short, String> row : descRows) {
            Language lang = Language.getById(row.value1());
            entity.getDescriptions().put(lang, row.value2());
        }

        // Aliases: possibly several rows per language, gathered into one list each.
        Result<Record2<Short, String>> aliasRows = ctx
                .select(Tables.WIKIDATA_ENTITY_ALIASES.LANG_ID, Tables.WIKIDATA_ENTITY_ALIASES.ALIAS)
                .from(Tables.WIKIDATA_ENTITY_ALIASES)
                .where(Tables.WIKIDATA_ENTITY_ALIASES.ENTITY_TYPE.eq(typeCode))
                .and(Tables.WIKIDATA_ENTITY_ALIASES.ENTITY_ID.eq(id))
                .fetch();
        Map<Language, List<String>> aliasesByLang = entity.getAliases();
        for (Record2<Short, String> row : aliasRows) {
            Language lang = Language.getById(row.value1());
            if (!aliasesByLang.containsKey(lang)) {
                aliasesByLang.put(lang, new ArrayList<String>());
            }
            aliasesByLang.get(lang).add(row.value2());
        }

        // Statements about this entity; rows that failed to build arrive as null.
        WikidataFilter ownStatements = new WikidataFilter.Builder()
                .withEntityType(type)
                .withEntityId(id)
                .build();
        for (WikidataStatement st : get(ownStatements)) {
            if (st == null) {
                continue;
            }
            entity.getStatements().add(st);
        }
        return entity;
    } finally {
        freeJooq(ctx);
    }
}
/**
 * Prepares this dao for bulk loading: starts the superclass load, creates the
 * label, description and alias loaders that do not exist yet, and replaces the
 * property map with an empty one.
 *
 * @throws DaoException if the superclass or a loader fails to start
 */
@Override
public void beginLoad() throws DaoException {
    super.beginLoad();
    if (labelLoader == null) {
        TableField[] labelColumns = new TableField[] {
                WIKIDATA_ENTITY_LABELS.ENTITY_TYPE,
                WIKIDATA_ENTITY_LABELS.ENTITY_ID,
                WIKIDATA_ENTITY_LABELS.LANG_ID,
                WIKIDATA_ENTITY_LABELS.LABEL,
        };
        labelLoader = new FastLoader(wpDs, labelColumns);
    }
    if (descLoader == null) {
        TableField[] descColumns = new TableField[] {
                WIKIDATA_ENTITY_DESCRIPTIONS.ENTITY_TYPE,
                WIKIDATA_ENTITY_DESCRIPTIONS.ENTITY_ID,
                WIKIDATA_ENTITY_DESCRIPTIONS.LANG_ID,
                WIKIDATA_ENTITY_DESCRIPTIONS.DESCRIPTION,
        };
        descLoader = new FastLoader(wpDs, descColumns);
    }
    if (aliasLoader == null) {
        TableField[] aliasColumns = new TableField[] {
                WIKIDATA_ENTITY_ALIASES.ENTITY_TYPE,
                WIKIDATA_ENTITY_ALIASES.ENTITY_ID,
                WIKIDATA_ENTITY_ALIASES.LANG_ID,
                WIKIDATA_ENTITY_ALIASES.ALIAS
        };
        aliasLoader = new FastLoader(wpDs, aliasColumns);
    }
    // Properties saved during this load are collected here (see save(WikidataEntity)).
    properties = new HashMap<Integer, WikidataEntity>();
}
/**
 * Queues an entity for bulk loading: every label, description and alias goes
 * to its loader, every statement is saved, and a property is additionally
 * recorded in the in-memory property map.
 *
 * Uses the loaders and the map created by {@link #beginLoad()}.
 *
 * @param entity entity to persist
 * @throws DaoException if a loader or statement insert fails
 */
@Override
public void save(WikidataEntity entity) throws DaoException {
    for (Map.Entry<Language, String> labelEntry : entity.getLabels().entrySet()) {
        Language lang = labelEntry.getKey();
        String label = labelEntry.getValue();
        labelLoader.load(entity.getType().code, entity.getId(), lang.getId(), label);
    }
    for (Map.Entry<Language, String> descEntry : entity.getDescriptions().entrySet()) {
        Language lang = descEntry.getKey();
        String description = descEntry.getValue();
        descLoader.load(entity.getType().code, entity.getId(), lang.getId(), description);
    }
    for (Map.Entry<Language, List<String>> aliasEntry : entity.getAliases().entrySet()) {
        Language lang = aliasEntry.getKey();
        for (String alias : aliasEntry.getValue()) {
            aliasLoader.load(entity.getType().code, entity.getId(), lang.getId(), alias);
        }
    }
    for (WikidataStatement statement : entity.getStatements()) {
        save(statement);
    }
    if (entity.getType() != WikidataEntity.Type.PROPERTY) {
        return;
    }
    // The map created by beginLoad() is a plain HashMap, so guard the write.
    synchronized (properties) {
        properties.put(entity.getId(), entity);
    }
}
/**
 * Inserts one statement row. Values are supplied in the column order of
 * FIELDS: id, entity type code, entity id, property id, lower-cased value
 * type name, JSON-encoded value, rank ordinal.
 *
 * @param item statement to persist
 * @throws DaoException if the insert fails
 */
@Override
public void save(WikidataStatement item) throws DaoException {
    WikidataEntity subject = item.getItem();
    WikidataValue value = item.getValue();
    insert(
            item.getId(),
            subject.getType().code,
            subject.getId(),
            item.getProperty().getId(),
            value.getTypeName().toLowerCase(),
            encodeValue(value),
            item.getRank().ordinal()
    );
}
/**
 * Serializes a value's JSON representation to the text form stored in, and
 * matched against, the VAL_STR column.
 *
 * @param value value to encode
 * @return JSON text produced by Gson for value.getJsonValue()
 */
private String encodeValue(WikidataValue value) {
    final String json = gson.toJson(value.getJsonValue());
    return json;
}
/**
 * Finishes a bulk load: ends the three entity loaders (if present) and drops
 * them, ends the superclass load, writes the collected properties to the
 * cache when one is configured, and asks the data source to optimize the four
 * Wikidata tables.
 *
 * @throws DaoException if any loader or the superclass fails to finish
 */
@Override
public void endLoad() throws DaoException {
    if (labelLoader != null) {
        labelLoader.endLoad();
    }
    if (descLoader != null) {
        descLoader.endLoad();
    }
    if (aliasLoader != null) {
        aliasLoader.endLoad();
    }
    // Drop the loaders so that the next beginLoad() creates new ones.
    aliasLoader = null;
    descLoader = null;
    labelLoader = null;

    super.endLoad();

    if (cache != null) {
        cache.put("wikidata-properties", properties);
    }

    wpDs.optimize(WIKIDATA_ENTITY_LABELS);
    wpDs.optimize(WIKIDATA_ENTITY_ALIASES);
    wpDs.optimize(WIKIDATA_ENTITY_DESCRIPTIONS);
    wpDs.optimize(WIKIDATA_STATEMENT);
}
/**
 * Returns the human-readable statements about one entity, grouped by the
 * property label of each statement.
 *
 * Rows that could not be turned into a statement are skipped: the iterable
 * returned by {@link #get(WikidataFilter)} yields null for them (after logging
 * a warning), and passing null on to getLocalStatement would fail with a
 * NullPointerException.
 *
 * @param lang language for the labels ("simple" is mapped to "en")
 * @param type type of the entity the statements are about
 * @param id   id of the entity the statements are about
 * @return map from property label to the localized statements for that property
 * @throws DaoException if a query or label lookup fails
 */
@Override
public Map<String, List<LocalWikidataStatement>> getLocalStatements(Language lang, WikidataEntity.Type type, int id) throws DaoException {
    lang = getRealLang(lang);
    WikidataFilter filter = new WikidataFilter.Builder()
            .withEntityType(type)
            .withEntityId(id)
            .build();
    Map<String, List<LocalWikidataStatement>> local = new HashMap<String, List<LocalWikidataStatement>>();
    for (WikidataStatement st : get(filter)) {
        if (st == null) {
            continue;   // row failed to build; already logged by get()
        }
        LocalWikidataStatement lws = getLocalStatement(lang, st);
        List<LocalWikidataStatement> sameProperty = local.get(lws.getProperty());
        if (sameProperty == null) {
            sameProperty = new ArrayList<LocalWikidataStatement>();
            local.put(lws.getProperty(), sameProperty);
        }
        sameProperty.add(lws);
    }
    return local;
}
/**
 * Renders one statement as text in the given language.
 *
 * The subject and the property are replaced by their labels. The value is
 * replaced by the label of the referenced item when it is an item value, by
 * "unknown" when it has no value, and by its toString() form otherwise.
 *
 * @param language language for the labels ("simple" is mapped to "en")
 * @param statement statement to render
 * @return localized statement whose full text is "item property value"
 * @throws DaoException if a label lookup fails
 */
@Override
public LocalWikidataStatement getLocalStatement(Language language, WikidataStatement statement) throws DaoException {
    Language lang = getRealLang(language );
    WikidataEntity subject = statement.getItem();
    String itemLabel = getLabel(lang, subject.getType(), subject.getId());
    WikidataEntity property = statement.getProperty();
    String propLabel = getLabel(lang, property.getType(), property.getId());

    WikidataValue wdv = statement.getValue();
    String valueText;
    if (wdv.getType() == WikidataValue.Type.ITEM) {
        valueText = getLabel(lang, WikidataEntity.Type.ITEM, wdv.getItemValue());
    } else {
        Object raw = wdv.getValue();
        valueText = (raw == null) ? "unknown" : raw.toString();
    }

    String sentence = itemLabel + " " + propLabel + " " + valueText;
    return new LocalWikidataStatement(lang, statement, sentence, itemLabel, propLabel, valueText);
}
/**
 * Returns a display label for an entity.
 *
 * Preference order: the requested language, then the fallback language, then
 * any label at all. When nothing is found a warning is logged and "unknown"
 * is returned. This now also covers a property id that is not known at all,
 * which previously failed with a NullPointerException.
 *
 * @param language preferred label language
 * @param type     entity type; must be PROPERTY or ITEM
 * @param id       entity id
 * @return the chosen label, or "unknown" when the entity has none
 * @throws DaoException if a lookup fails
 * @throws IllegalArgumentException if type is neither PROPERTY nor ITEM
 */
@Override
public String getLabel(Language language, WikidataEntity.Type type, int id) throws DaoException {
    if (type == WikidataEntity.Type.PROPERTY) {
        WikidataEntity prop = getProperty(id); // should be cached, fast
        // getProperty(int) returns null for an id missing from the property map.
        if (prop == null || prop.getLabels().isEmpty()) {
            LOG.warn("no labels for property " + id);
            return "unknown";
        }
        Map<Language, String> labels = prop.getLabels();
        if (labels.containsKey(language)) {
            return labels.get(language);
        } else if (labels.containsKey(FALLBACK_LANGUAGE)) {
            return labels.get(FALLBACK_LANGUAGE);
        } else {
            return labels.values().iterator().next();
        }
    } else if (type == WikidataEntity.Type.ITEM) {
        DSLContext jooq = getJooq();
        try {
            // Languages to try in order; do not query the fallback twice
            // when it is the language that was asked for.
            List<Language> preferred;
            if (FALLBACK_LANGUAGE.equals(language)) {
                preferred = Arrays.asList(language);
            } else {
                preferred = Arrays.asList(language, FALLBACK_LANGUAGE);
            }
            for (Language l : preferred) {
                Result<Record1<String>> result = jooq.select(Tables.WIKIDATA_ENTITY_LABELS.LABEL)
                        .from(Tables.WIKIDATA_ENTITY_LABELS)
                        .where(Tables.WIKIDATA_ENTITY_LABELS.ENTITY_TYPE.eq("" + type.code))
                        .and(WIKIDATA_ENTITY_LABELS.ENTITY_ID.eq(id))
                        .and(WIKIDATA_ENTITY_LABELS.LANG_ID.eq(l.getId()))
                        .fetch();
                if (result.size() >= 1) {
                    return result.get(0).value1();
                }
            }
            // Last resort: a label in any language.
            Result<Record1<String>> result = jooq.select(Tables.WIKIDATA_ENTITY_LABELS.LABEL)
                    .from(Tables.WIKIDATA_ENTITY_LABELS)
                    .where(Tables.WIKIDATA_ENTITY_LABELS.ENTITY_TYPE.eq("" + type.code))
                    .and(WIKIDATA_ENTITY_LABELS.ENTITY_ID.eq(id))
                    .limit(1)
                    .fetch();
            if (result.size() >= 1) {
                return result.get(0).value1();
            } else {
                LOG.warn("no labels for item " + id);
                return "unknown";
            }
        } finally {
            freeJooq(jooq);
        }
    } else {
        throw new IllegalArgumentException("Unknown entity type: " + type);
    }
}
/**
 * Finds the statements that assign the given value to the given property.
 *
 * @param property property the statements must use
 * @param value    value the statements must carry
 * @return lazily evaluated matching statements
 * @throws DaoException if the query cannot be started
 */
@Override
public Iterable<WikidataStatement> getByValue(WikidataEntity property, WikidataValue value) throws DaoException {
    return get(
            new WikidataFilter.Builder()
                    .withPropertyId(property.getId())
                    .withValue(value)
                    .build());
}
/**
 * Finds the statements that assign the given value to any property known by
 * the given name (label or alias, see getPropertyByName(String)).
 *
 * @param propertyName property name to resolve
 * @param value        value the statements must carry
 * @return matching statements; an empty list when no property has that name
 * @throws DaoException if a lookup or the query fails
 */
@Override
public Iterable<WikidataStatement> getByValue(String propertyName, WikidataValue value) throws DaoException {
    Set<Integer> matchingPropIds = new HashSet<Integer>();
    for (WikidataEntity match : getPropertyByName(propertyName)) {
        matchingPropIds.add(match.getId());
    }
    if (!matchingPropIds.isEmpty()) {
        return get(
                new WikidataFilter.Builder()
                        .withPropertyIds(matchingPropIds)
                        .withValue(value)
                        .build());
    }
    return new ArrayList<WikidataStatement>();
}
/**
 * Returns the ids of all items that have a statement assigning the given
 * value to a property with the given name.
 *
 * Rows that could not be turned into a statement are skipped: the iterable
 * returned by {@link #get(WikidataFilter)} yields null for them, and
 * dereferencing that null would fail with a NullPointerException.
 *
 * @param propertyName property name (label or alias)
 * @param value        value to match
 * @return ids of the matching items; statements about non-item entities are ignored
 * @throws DaoException if a lookup or the query fails
 */
@Override
public Set<Integer> conceptsWithValue(String propertyName, WikidataValue value) throws DaoException {
    Set<Integer> concepts = new HashSet<Integer>();
    for (WikidataStatement st : getByValue(propertyName, value)) {
        if (st == null) {
            continue;   // row failed to build; already logged by get()
        }
        if (st.getItem().getType() == WikidataEntity.Type.ITEM) {
            concepts.add(st.getItem().getId());
        }
    }
    return concepts;
}
/**
 * Returns the local pages, in one language, of all items that have a
 * statement assigning the given value to a property with the given name.
 *
 * @param propertyName property name (label or alias)
 * @param value        value to match
 * @param language     language the pages must exist in
 * @return local ids of the matching pages; items without a universal page or
 *         without a page in that language are left out
 * @throws DaoException if a lookup fails
 */
@Override
public Set<LocalId> pagesWithValue(String propertyName, WikidataValue value, Language language) throws DaoException {
    Set<LocalId> pages = new HashSet<LocalId>();
    for (int conceptId : conceptsWithValue(propertyName, value)) {
        UniversalPage concept = upDao.getById(conceptId);
        if (concept == null || !concept.isInLanguage(language)) {
            continue;
        }
        pages.add(new LocalId(language, concept.getLocalId(language)));
    }
    return pages;
}
/**
 * Runs a filtered query against the statement table and returns the matches lazily.
 *
 * Every non-null filter component becomes an IN / equality condition, and all
 * conditions are combined. The returned iterable yields null for a row that
 * cannot be turned into a statement (the failure is logged as a warning), so
 * callers must be prepared to skip nulls.
 *
 * The jOOQ context is handed to the returned iterable, which is presumably
 * responsible for releasing it once iteration finishes. If the query cannot
 * even be started, the context is released here instead of being leaked.
 *
 * @param filter restrictions on entity type, entity id, property id, rank and value
 * @return lazily evaluated matching statements
 * @throws DaoException if the connection cannot be obtained or released
 * @throws UnsupportedOperationException if the filter restricts language ids
 * @throws IllegalArgumentException if the filter values are empty or of mixed types
 */
@Override
public Iterable<WikidataStatement> get(WikidataFilter filter) throws DaoException {
    List<Condition> conditions = new ArrayList<Condition>();
    if (filter.getLangIds() != null) {
        throw new UnsupportedOperationException("Filter doesn't support lang ids yet");
    }
    if (filter.getEntityTypes() != null) {
        conditions.add(WIKIDATA_STATEMENT.ENTITY_TYPE.in(filter.getEntityTypeCodes()));
    }
    if (filter.getEntityIds() != null) {
        conditions.add(WIKIDATA_STATEMENT.ENTITY_ID.in(filter.getEntityIds()));
    }
    if (filter.getPropertyIds() != null) {
        conditions.add(WIKIDATA_STATEMENT.PROP_ID.in(filter.getPropertyIds()));
    }
    if (filter.getRanks() != null) {
        conditions.add(WIKIDATA_STATEMENT.RANK.in(filter.getRankOrdinals()));
    }
    if (filter.getValues() != null) {
        String type = null;
        List<String> values = new ArrayList<String>();
        for (WikidataValue value : filter.getValues()) {
            values.add(encodeValue(value));
            if (type == null) {
                type = value.getTypeName();
            }
            if (!type.equals(value.getTypeName())) {
                throw new IllegalArgumentException("All wikidata filter values must have the same type");
            }
        }
        // Without any value there is no type to match on; fail clearly
        // instead of with a NullPointerException on the type below.
        if (values.isEmpty()) {
            throw new IllegalArgumentException("Wikidata filter values must not be empty");
        }
        conditions.add(WIKIDATA_STATEMENT.VAL_TYPE.eq(type.toLowerCase()).and(WIKIDATA_STATEMENT.VAL_STR.in(values)));
    }
    DSLContext jooq = getJooq();
    boolean handedOff = false;
    try {
        Cursor<Record> result = jooq.select().
                from(Tables.WIKIDATA_STATEMENT).
                where(conditions).fetchLazy(getFetchSize());
        Iterable<WikidataStatement> statements = new SimpleSqlDaoIterable<WikidataStatement>(result, jooq) {
            @Override
            public WikidataStatement transform(Record r) {
                try {
                    return buildStatement(r);
                } catch (DaoException e) {
                    LOG.warn(e.getMessage(), e);
                    return null;
                }
            }
        };
        handedOff = true;
        return statements;
    } finally {
        // On success the iterable owns the context; only clean up on failure.
        if (!handedOff) {
            freeJooq(jooq);
        }
    }
}
/**
 * Not supported by this dao; query with a WikidataFilter instead.
 *
 * @param daoFilter ignored
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public Iterable<WikidataStatement> get(DaoFilter daoFilter) throws DaoException {
    throw new UnsupportedOperationException("WikidataSqlDao doesn't support DaoFilter; use a WikidataFilter");
}
/**
 * Not supported by this dao.
 *
 * @param daoFilter ignored
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public int getCount(DaoFilter daoFilter) throws DaoException {
    throw new UnsupportedOperationException("WikidataSqlDao doesn't support counting by DaoFilter");
}
/**
 * Converts one row of the statement table into a WikidataStatement.
 *
 * The subject is a bare entity (type and id only), the property comes from
 * getProperty(int), the value is parsed from the stored JSON, and the rank is
 * looked up by the stored ordinal.
 *
 * @param record row to convert; may be null
 * @return the statement, or null when record is null
 * @throws DaoException if the stored value cannot be parsed
 */
protected WikidataStatement buildStatement(Record record) throws DaoException {
    if (record == null) {
        return null;
    }
    // Only the first character of the stored entity type is used as the type code.
    char typeCode = record.getValue(Tables.WIKIDATA_STATEMENT.ENTITY_TYPE).charAt(0);
    WikidataEntity subject = new WikidataEntity(
            WikidataEntity.Type.getByCode(typeCode),
            record.getValue(Tables.WIKIDATA_STATEMENT.ENTITY_ID)
    );
    WikidataEntity property = getProperty(record.getValue(Tables.WIKIDATA_STATEMENT.PROP_ID));
    Short rankOrdinal = record.getValue(Tables.WIKIDATA_STATEMENT.RANK);

    String storedJson = record.getValue(Tables.WIKIDATA_STATEMENT.VAL_STR);
    JsonElement json = new JsonParser().parse(storedJson);
    WikidataValue value;
    try {
        value = parser.jsonToValue( record.getValue(Tables.WIKIDATA_STATEMENT.VAL_TYPE), json);
    } catch (WpParseException e) {
        throw new DaoException(e);
    }

    WikidataStatement.Rank rank = WikidataStatement.Rank.values()[rankOrdinal];
    return new WikidataStatement(
            record.getValue(Tables.WIKIDATA_STATEMENT.ID),
            subject,
            property,
            value,
            rank
    );
}
/**
 * Configuration provider that builds a WikidataSqlDao for the "dao.wikidata"
 * path when the configured type is "sql".
 */
public static class Provider extends org.wikibrain.conf.Provider<WikidataDao> {
    public Provider(Configurator configurator, Configuration config) throws ConfigurationException {
        super(configurator, config);
    }

    @Override
    public Class<WikidataDao> getType() {
        return WikidataDao.class;
    }

    @Override
    public String getPath() {
        return "dao.wikidata";
    }

    /**
     * Builds the dao from its configuration block.
     *
     * @param name          name of the configured component
     * @param config        configuration block; "type" and "dataSource" are read from it
     * @param runtimeParams runtime parameters (not used here)
     * @return the dao with its cache directory enabled, or null when type is not "sql"
     * @throws ConfigurationException if a dependency cannot be built or the dao fails to initialize
     */
    @Override
    public WikidataDao get(String name, Config config, Map<String, String> runtimeParams) throws ConfigurationException {
        String daoType = config.getString("type");
        if (!daoType.equals("sql")) {
            return null;
        }
        try {
            WpDataSource dataSource = getConfigurator().get(
                    WpDataSource.class,
                    config.getString("dataSource"));
            LocalPageDao localPages = getConfigurator().get(LocalPageDao.class);
            UniversalPageDao universalPages = getConfigurator().get(UniversalPageDao.class);
            WikidataSqlDao dao = new WikidataSqlDao(dataSource, localPages, universalPages);

            // Make sure the cache directory exists before enabling the cache.
            File cacheDir = new File(getConfig().get().getString("dao.sqlCachePath"));
            if (!cacheDir.isDirectory()) {
                cacheDir.mkdirs();
            }
            dao.useCache(cacheDir);
            return dao;
        } catch (DaoException e) {
            throw new ConfigurationException(e);
        }
    }
}
}