package org.wikibrain.loader;
import org.apache.commons.cli.*;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.conf.DefaultOptionBuilder;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.cmd.FileMatcher;
import org.wikibrain.core.dao.*;
import org.wikibrain.core.dao.sql.WpDataSource;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.lang.LanguageInfo;
import org.wikibrain.core.lang.LanguageSet;
import org.wikibrain.core.model.InterLanguageLink;
import org.wikibrain.core.model.LocalCategoryMember;
import org.wikibrain.core.model.LocalLink;
import org.wikibrain.parser.wiki.*;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Parses the wiki text associated with articles
* and populates data stores for links, ills, and categories.
*/
public class WikiTextLoader {
private static final Logger LOG = LoggerFactory.getLogger(WikiTextLoader.class);
public static void main(String args[]) throws ConfigurationException, DaoException, IOException {
Options options = new Options();
options.addOption(
new DefaultOptionBuilder()
.withLongOpt("drop-tables")
.withDescription("drop and recreate all tables")
.create("d"));
EnvBuilder.addStandardOptions(options);
CommandLineParser parser = new PosixParser();
CommandLine cmd;
try {
cmd = parser.parse(options, args);
} catch (ParseException e) {
System.err.println( "Invalid option usage: " + e.getMessage());
new HelpFormatter().printHelp("DumpLoader", options);
return;
}
Env env = new EnvBuilder(cmd).build();
Configurator conf = env.getConfigurator();
List<ParserVisitor> visitors = new ArrayList<ParserVisitor>();
RawPageDao rpDao = conf.get(RawPageDao.class);
LocalPageDao lpDao = conf.get(LocalPageDao.class);
LocalLinkDao llDao = conf.get(LocalLinkDao.class);
LocalCategoryMemberDao lcmDao = conf.get(LocalCategoryMemberDao.class);
InterLanguageLinkDao illDao = conf.get(InterLanguageLinkDao.class);
MetaInfoDao metaDao = conf.get(MetaInfoDao.class);
LocalLinkVisitor linkVisitor = new LocalLinkVisitor(llDao, lpDao, metaDao);
ParserVisitor catVisitor = new LocalCategoryVisitor(lpDao, lcmDao, metaDao);
ParserVisitor illVisitor = new InterLanguageLinkVisitor(illDao, lpDao, metaDao);
visitors.add(linkVisitor);
visitors.add(catVisitor);
visitors.add(illVisitor);
if(cmd.hasOption("d")) {
llDao.clear();
lcmDao.clear();
illDao.clear();
metaDao.clear(LocalLink.class);
metaDao.clear(LocalCategoryMember.class);
metaDao.clear(InterLanguageLink.class);
}
illDao.beginLoad();
llDao.beginLoad();
lcmDao.beginLoad();
metaDao.beginLoad();
for (Language lang : env.getLanguages().getLanguages()) {
LOG.info("loading links for " + lang);
final LocalLinkSet linkSet = new LocalLinkSet();
linkVisitor.setLinkListener(
new LocalLinkVisitor.Listener() {
public void notify(LocalLink link) { linkSet.addLink(link); }
});
WikiTextDumpParser dumpParser = new WikiTextDumpParser(
rpDao, LanguageInfo.getByLanguage(lang), LanguageSet.ALL);
dumpParser.parse(visitors);
linkSet.finish();
List<File> paths = env.getFiles(lang, FileMatcher.LINK_SQL);
if (paths.size() > 1) {
throw new IllegalStateException();
}
if (paths.size() == 1) {
SqlLinksLoader sqlLoader = new SqlLinksLoader(llDao, lpDao, metaDao, paths.get(0), linkSet);
sqlLoader.load();
}
}
illDao.endLoad();
llDao.endLoad();
lcmDao.endLoad();
metaDao.endLoad();
System.out.println("encountered " + metaDao.getInfo(LocalLink.class).getNumErrors() + " parse errors");
// Why is this necessary???
// It seems like things die without it :(
System.exit(0);
}
}