package org.wikibrain.wikidata;

import gnu.trove.map.TIntIntMap;
import gnu.trove.set.TIntSet;
import gnu.trove.set.hash.TIntHashSet;
import org.apache.commons.cli.*;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.conf.DefaultOptionBuilder;
import org.wikibrain.core.WikiBrainException;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.MetaInfoDao;
import org.wikibrain.core.dao.UniversalPageDao;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.lang.LanguageSet;
import org.wikibrain.download.FileDownloader;
import org.wikibrain.parser.WpParseException;
import org.wikibrain.utils.ParallelForEach;
import org.wikibrain.utils.Procedure;
import org.wikibrain.utils.WpIOUtils;
import org.wikibrain.utils.WpThreadUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.sql.SQLException;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Loads the contents of a Wikidata JSON dump into the various daos.
 *
 * If no dump file is passed on the command line, the most recent dump is downloaded
 * to the configured download directory first. Supported flags:
 *   -d / --drop-tables    drop and recreate all tables before loading
 *   -k / --keep-labeled   also keep labeled entities that have no known universal page
 */
public class WikidataDumpLoader {
    private static final Logger LOG = LoggerFactory.getLogger(WikidataDumpLoader.class);

    private final AtomicInteger counter = new AtomicInteger();

    private final MetaInfoDao metaDao;
    private final WikidataDao wikidataDao;
    private final UniversalPageDao universalPageDao;
    private final LanguageSet languages;
    private final WikidataParser wdParser = new WikidataParser();
    private final TIntSet universalIds;
    private boolean keepAllLabeledEntities = false;

    public WikidataDumpLoader(WikidataDao wikidataDao, MetaInfoDao metaDao, UniversalPageDao upDao, LanguageSet langs) throws DaoException {
        this.wikidataDao = wikidataDao;
        this.metaDao = metaDao;
        this.languages = langs;
        this.universalPageDao = upDao;

        Map<Language, TIntIntMap> localMaps = universalPageDao.getAllUnivToLocalIdsMap(languages);

        // Build up the set of universal ids from the local ids that we know about
        this.universalIds = new TIntHashSet();
        for (TIntIntMap langMap : localMaps.values()) {
            universalIds.addAll(langMap.keys());
        }
    }

    /**
     * Loads the given Wikidata JSON dump, parsing and saving entities in parallel.
     *
     * @param file the dump file to load
     */
    public void load(final File file) throws IOException {
        LineIterator lines = new LineIterator(WpIOUtils.openBufferedReader(file));
        ParallelForEach.iterate(
                lines,
                WpThreadUtils.getMaxThreads(),
                1000,
                new Procedure<String>() {
                    @Override
                    public void call(String page) {
                        try {
                            save(file, page);
                            metaDao.incrementRecords(WikidataEntity.class);
                        } catch (WpParseException e) {
                            LOG.warn("parsing of " + file.getPath() + " failed:", e);
                            metaDao.incrementErrorsQuietly(WikidataEntity.class);
                        } catch (DaoException e) {
                            LOG.warn("parsing of " + file.getPath() + " failed:", e);
                            metaDao.incrementErrorsQuietly(WikidataEntity.class);
                        }
                    }
                },
                Integer.MAX_VALUE
        );
        lines.close();
    }

    private void save(File file, String json) throws WpParseException, DaoException {
        // Skip lines that cannot contain a JSON object (e.g. the "[" and "]" array delimiters)
        if (!json.contains("{")) {
            return;
        }
        // Strip the trailing comma that separates entities in the dump's JSON array
        json = json.trim();
        if (json.endsWith(",")) {
            json = json.substring(0, json.length() - 1);
        }
        if (counter.incrementAndGet() % 100000 == 0) {
            LOG.info("processing wikidata entity " + counter.get());
        }
        WikidataEntity entity = wdParser.parse(json);
        // TODO: check if others use prune's boolean?
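        // Assumption: prune(...) trims the parsed entity's per-language data (labels,
        // descriptions, aliases) down to the configured language set before it is saved.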
        entity.prune(languages);
        if (keepEntity(entity)) {
            wikidataDao.save(entity);
        }
    }

    private boolean keepEntity(WikidataEntity entity) {
        if (entity.getType() == WikidataEntity.Type.PROPERTY) {
            return true;                    // always keep properties
        } else if (universalIds.contains(entity.getId())) {
            return true;                    // keep entities with a known universal page
        } else if (keepAllLabeledEntities && !entity.getLabels().isEmpty()) {
            return true;                    // optionally keep any entity that has a label
        } else {
            return false;
        }
    }

    public void setKeepAllLabeledEntities(boolean keepAllLabeledEntities) {
        this.keepAllLabeledEntities = keepAllLabeledEntities;
    }

    public static void main(String[] args) throws ClassNotFoundException, SQLException, IOException, ConfigurationException, DaoException, WikiBrainException, java.text.ParseException, InterruptedException {
        Options options = new Options();
        options.addOption(
                new DefaultOptionBuilder()
                        .withLongOpt("drop-tables")
                        .withDescription("drop and recreate all tables")
                        .create("d"));
        options.addOption(
                new DefaultOptionBuilder()
                        .withLongOpt("keep-labeled")
                        .withDescription("keep all labeled entities")
                        .create("k"));
        EnvBuilder.addStandardOptions(options);

        CommandLineParser parser = new PosixParser();
        CommandLine cmd;
        try {
            cmd = parser.parse(options, args);
        } catch (ParseException e) {
            System.err.println("Invalid option usage: " + e.getMessage());
            new HelpFormatter().printHelp("WikidataDumpLoader", options);
            return;
        }
        Env env = new EnvBuilder(cmd).build();
        Configurator conf = env.getConfigurator();

        File path;
        if (cmd.getArgList().isEmpty()) {
            WikidataDumpHelper helper = new WikidataDumpHelper();

            // No dump was specified, so fetch the most recent file (if necessary) to the standard path
            String downloadDir = conf.getConf().get().getString("download.path");
            File dest = FileUtils.getFile(downloadDir, helper.getMostRecentFile());
            if (!dest.isFile()) {
                dest.getParentFile().mkdirs();

                // Download to a temporary file first and move it into place afterwards,
                // so a partial download is never mistaken for a complete dump.
                File tmp = File.createTempFile("wikibrain-wikidata", "json");
                FileUtils.deleteQuietly(tmp);
                URL url = new URL(helper.getMostRecentUrl());
                FileDownloader downloader = new FileDownloader();
                downloader.download(url, tmp);
                if (dest.isFile()) {
                    throw new IllegalStateException();
                }
                FileUtils.moveFile(tmp, dest);
            }
            path = dest;
        } else if (cmd.getArgList().size() == 1) {
            path = new File(cmd.getArgList().get(0).toString());
        } else {
            System.err.println("Invalid option usage:");
            new HelpFormatter().printHelp("WikidataDumpLoader", options);
            return;
        }

        WikidataDao wdDao = conf.get(WikidataDao.class);
        UniversalPageDao upDao = conf.get(UniversalPageDao.class);
        MetaInfoDao metaDao = conf.get(MetaInfoDao.class);
        LanguageSet langs = conf.get(LanguageSet.class);

        WikidataDumpLoader loader = new WikidataDumpLoader(wdDao, metaDao, upDao, langs);

        if (cmd.hasOption("d")) {
            wdDao.clear();
            metaDao.clear(WikidataStatement.class);
        }
        if (cmd.hasOption("k")) {
            loader.setKeepAllLabeledEntities(true);
        }

        wdDao.beginLoad();
        metaDao.beginLoad();

        loader.load(path);

        LOG.info("building indexes");
        wdDao.endLoad();
        metaDao.endLoad();
        LOG.info("finished");
    }
}