package org.wikibrain.parser.wiki;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.DaoFilter;
import org.wikibrain.core.dao.RawPageDao;
import org.wikibrain.core.lang.LanguageInfo;
import org.wikibrain.core.lang.LanguageSet;
import org.wikibrain.core.model.RawPage;
import org.wikibrain.utils.ParallelForEach;
import org.wikibrain.utils.Procedure;
import org.wikibrain.utils.WpThreadUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
*/
public class
WikiTextDumpParser {
public static final Logger LOG = LoggerFactory.getLogger(WikiTextDumpParser.class);
// maximum number of raw pages in the parsing buffer
public static final int MAX_QUEUE = 1000;
private final LanguageInfo language;
private final RawPageDao rawPageDao;
private final LanguageSet allowedLanguages;
private int maxThreads = WpThreadUtils.getMaxThreads();
public WikiTextDumpParser(RawPageDao rawPageDao, LanguageInfo language) {
this(rawPageDao, language, null);
}
public WikiTextDumpParser(RawPageDao rawPageDao, LanguageInfo language, LanguageSet allowedIllLangs) {
this.language = language;
this.allowedLanguages = allowedIllLangs;
this.rawPageDao = rawPageDao;
}
public void setMaxThreads(int maxThreads) {
this.maxThreads = maxThreads;
}
/**
* Parses the input file completely. First splits the file into individual PageXmls via
* DumpPageXmlParser, then parses each page via WikiTextParser
*
* @param visitor extracts data from side effects
*/
public void parse(ParserVisitor visitor) throws DaoException {
parse(Arrays.asList(visitor));
}
public synchronized void parse(List<ParserVisitor> visitors) throws DaoException {
DaoFilter daoFilter = new DaoFilter().setLanguages(language.getLanguage());
ParallelForEach.iterate(
rawPageDao.get(daoFilter).iterator(),
maxThreads,
MAX_QUEUE,
new ParserProcedure(visitors),
10000
);
}
class ParserProcedure implements Procedure<RawPage> {
private final ThreadLocal<WikiTextParser> parserHolder = new ThreadLocal<WikiTextParser>();
private final List<ParserVisitor> visitors;
ParserProcedure(List<ParserVisitor> visitors) {
this.visitors = visitors;
}
@Override
public void call(RawPage rp) {
if (rp == null) {
return;
}
WikiTextParser parser = parserHolder.get();
if (parser == null) {
parser = new WikiTextParser(language, allowedLanguages, visitors);
parserHolder.set(parser);
}
try {
parser.parse(rp);
} catch (Exception e) {
String title = "unknown";
LOG.warn("exception while parsing " + title, e);
}
}
}
}