package org.wikibrain.loader;

import gnu.trove.set.TIntSet;
import gnu.trove.set.hash.TIntHashSet;
import org.apache.commons.cli.*;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.comparator.SizeFileComparator;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.conf.DefaultOptionBuilder;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.cmd.FileMatcher;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.LocalPageDao;
import org.wikibrain.core.dao.MetaInfoDao;
import org.wikibrain.core.dao.RawPageDao;
import org.wikibrain.core.dao.sql.WpDataSource;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.lang.LanguageInfo;
import org.wikibrain.core.model.LocalPage;
import org.wikibrain.core.model.NameSpace;
import org.wikibrain.core.model.RawPage;
import org.wikibrain.parser.DumpSplitter;
import org.wikibrain.parser.WpParseException;
import org.wikibrain.parser.xml.PageXmlParser;
import org.wikibrain.utils.ParallelForEach;
import org.wikibrain.utils.Procedure;
import org.wikibrain.utils.WpThreadUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.sql.SQLException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Loads the contents of a dump into the various daos.
 */
public class DumpLoader {

    private static final Logger LOG = LoggerFactory.getLogger(DumpLoader.class);

    public static final List<NameSpace> DEFAULT_NAMESPACES = Arrays.asList(NameSpace.ARTICLE, NameSpace.CATEGORY);

    private final AtomicInteger allPages = new AtomicInteger();
    private final AtomicInteger interestingPages = new AtomicInteger();

    // Namespaces whose pages should be loaded
    private final Collection<NameSpace> nss;

    // If a maximum number of articles per language is set, langCounters tracks the count for each language
    private Integer maxPerLang = null;
    private final Map<Language, AtomicInteger> langCounters = new ConcurrentHashMap<Language, AtomicInteger>();

    private final LocalPageDao localPageDao;
    private final RawPageDao rawPageDao;
    private final MetaInfoDao metaDao;
    private TIntSet validIds = null;

    public DumpLoader(LocalPageDao localPageDao, RawPageDao rawPageDao, MetaInfoDao metaDao) {
        this(localPageDao, rawPageDao, metaDao, DEFAULT_NAMESPACES);
    }

    public DumpLoader(LocalPageDao localPageDao, RawPageDao rawPageDao, MetaInfoDao metaDao, Collection<NameSpace> nss) {
        this.localPageDao = localPageDao;
        this.rawPageDao = rawPageDao;
        this.metaDao = metaDao;
        this.nss = nss;
    }

    public void setValidIds(TIntSet validIds) {
        this.validIds = validIds;
    }

    /**
     * Loads one dump file. Expects the file name to start with the language code followed by "wiki",
     * for example "enwiki".
     * @param file
     */
    public void load(final File file) {
        final Language lang = FileMatcher.ARTICLES.getLanguage(file.getAbsolutePath());
        if (!keepProcessingArticles(lang)) {
            return;
        }
        DumpSplitter parser = new DumpSplitter(file);

        ParallelForEach.iterate(
                parser.iterator(),
                WpThreadUtils.getMaxThreads(),
                1000,
                new Procedure<String>() {
                    @Override
                    public void call(String page) throws Exception {
                        try {
                            processOnePage(file, lang, page);
                        } catch (WpParseException e) {
                            LOG.warn("parsing of " + file.getPath() + " failed:", e);
                        }
                    }
                },
                Integer.MAX_VALUE
        );
    }

    /**
     * Parses a single page from the dump and saves it if it is interesting.
     */
    private void processOnePage(File file, Language lang, String page) throws WpParseException {
        if (!keepProcessingArticles(lang)) {
            return;
        }
        if (allPages.incrementAndGet() % 10000 == 0) {
            LOG.info("processing article " + allPages.get() + " found " + interestingPages.get() + " interesting articles");
        }
        PageXmlParser parser = new PageXmlParser(LanguageInfo.getByLanguage(lang));
        RawPage rp = parser.parse(page);
        if (isInteresting(rp)) {
            interestingPages.incrementAndGet();
            save(file, rp);
            incrementLangCount(lang);
        }
    }

    /**
     * A page is interesting if it falls in one of the requested namespaces and,
     * when a set of valid ids was supplied, its local id is in that set.
     */
    private boolean isInteresting(RawPage rp) {
        if (rp == null || rp.getNamespace() == null) {
            return false;
        } else if (validIds != null && !validIds.contains(rp.getLocalId())) {
            return false;
        } else {
            return nss.contains(rp.getNamespace());
        }
    }

    private boolean keepProcessingArticles(Language lang) {
        if (maxPerLang == null) {
            return true;
        } else if (!langCounters.containsKey(lang)) {
            return true;
        } else {
            return langCounters.get(lang).get() < maxPerLang;
        }
    }

    private void incrementLangCount(Language lang) {
        if (maxPerLang != null) {
            if (!langCounters.containsKey(lang)) {
                synchronized (langCounters) {
                    if (!langCounters.containsKey(lang)) {
                        langCounters.put(lang, new AtomicInteger());
                    }
                }
            }
            langCounters.get(lang).incrementAndGet();
        }
    }

    /**
     * Saves the raw page and the corresponding local page, recording successes and errors in the meta dao.
     */
    private void save(File file, RawPage rp) {
        try {
            rawPageDao.save(rp);
            metaDao.incrementRecords(rp.getClass(), rp.getLanguage());
        } catch (Exception e) {
            LOG.warn("saving raw page from " + file + " failed:", e);
            metaDao.incrementErrorsQuietly(rp.getClass(), rp.getLanguage());
        }
        try {
            LocalPage lp = new LocalPage(
                    rp.getLanguage(), rp.getLocalId(),
                    rp.getTitle(), rp.getNamespace(),
                    rp.isRedirect(), rp.isDisambig()
            );
            localPageDao.save(lp);
            metaDao.incrementRecords(lp.getClass(), lp.getLanguage());
        } catch (Exception e) {
            LOG.warn("saving local page from " + file + " failed:", e);
            metaDao.incrementErrorsQuietly(LocalPage.class, rp.getLanguage());
        }
    }

    /**
     * Usage: DumpLoader [-d] [-x maxArticlesPerLang] [-v validIdsFile] [dumpFile ...]
     * plus the standard environment options. If no dump files are given, the downloaded
     * article dumps known to the environment are loaded.
     */
    public static void main(String[] args) throws ClassNotFoundException, SQLException, IOException, ConfigurationException, DaoException {
        Options options = new Options();
        options.addOption(
                new DefaultOptionBuilder()
                        .withLongOpt("drop-tables")
                        .withDescription("drop and recreate all tables")
                        .create("d"));
        options.addOption(
                new DefaultOptionBuilder()
                        .withLongOpt("max-articles")
                        .hasArg()
                        .withDescription("maximum articles per language")
                        .create("x"));
        options.addOption(
                new DefaultOptionBuilder()
                        .withLongOpt("validIds")
                        .hasArg()
                        .withDescription("list of valid ids")
                        .create("v"));
        EnvBuilder.addStandardOptions(options);

        CommandLineParser parser = new PosixParser();
        CommandLine cmd;
        try {
            cmd = parser.parse(options, args);
        } catch (ParseException e) {
            System.err.println("Invalid option usage: " + e.getMessage());
            new HelpFormatter().printHelp("DumpLoader", options);
            System.exit(1);
            return;
        }

        EnvBuilder builder = new EnvBuilder(cmd);
        if (!builder.hasExplicitLanguageSet()) {
            builder.setUseDownloadedLanguages();
        }
        Env env = builder.build();
        Configurator conf = env.getConfigurator();

        List<File> paths;
        if (cmd.getArgList().isEmpty()) {
            paths = env.getFiles(FileMatcher.ARTICLES);
        } else {
            paths = new ArrayList<File>();
            for (Object arg : cmd.getArgList()) {
                paths.add(new File((String) arg));
            }
        }

        // Schedule the biggest files first to improve parallel performance
        Collections.sort(paths, SizeFileComparator.SIZE_REVERSE);

        LocalPageDao lpDao = conf.get(LocalPageDao.class);
        RawPageDao rpDao = conf.get(RawPageDao.class);
        MetaInfoDao metaDao = conf.get(MetaInfoDao.class);

        final DumpLoader loader = new DumpLoader(lpDao, rpDao, metaDao);
        if (cmd.hasOption("x")) {
            loader.maxPerLang = Integer.valueOf(cmd.getOptionValue("x"));
        }
        if (cmd.hasOption("v")) {
            TIntSet validIds = new TIntHashSet();
            for (String line : FileUtils.readLines(new File(cmd.getOptionValue("v")))) {
                validIds.add(Integer.valueOf(line.trim()));
            }
            loader.setValidIds(validIds);
        }

        if (cmd.hasOption("d")) {
            lpDao.clear();
            rpDao.clear();
            metaDao.clear();
        }
        lpDao.beginLoad();
        rpDao.beginLoad();
        metaDao.beginLoad();

        // Load each dump file; pages within a dump are parsed in parallel
        for (File path : paths) {
            LOG.info("processing file: " + path);
            loader.load(path);
        }

        lpDao.endLoad();
        rpDao.endLoad();
        metaDao.endLoad();
    }
}