package org.wikibrain.wikidata; import org.apache.commons.io.FileUtils; import org.apache.commons.io.LineIterator; import org.apache.commons.lang3.StringEscapeUtils; import org.wikibrain.core.lang.Language; import org.wikibrain.core.lang.LanguageInfo; import org.wikibrain.core.lang.LanguageSet; import org.wikibrain.core.model.RawPage; import org.wikibrain.parser.DumpSplitter; import org.wikibrain.parser.xml.PageXmlParser; import org.wikibrain.utils.WpIOUtils; import java.io.File; import java.io.IOException; import java.util.Arrays; import java.util.Iterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * @author Shilad Sen */ public class WikidataDumpParser implements Iterable<WikidataEntity> { public static final Logger LOG = LoggerFactory.getLogger(DumpSplitter.class); private final WikidataParser wdParser; private final LanguageSet languages; private final File file; public WikidataDumpParser(File file) { this(file, LanguageSet.ALL); } /** * @param file */ public WikidataDumpParser(File file, LanguageSet languages) { this.file = file; this.languages = languages; this.wdParser = new WikidataParser(); } @Override public Iterator<WikidataEntity> iterator() { return new IteratorImpl(); } public class IteratorImpl implements Iterator<WikidataEntity> { private final Iterator<String> iterImpl; private WikidataEntity buff; public IteratorImpl() { try { this.iterImpl = new LineIterator(WpIOUtils.openBufferedReader(file)); } catch (IOException e) { throw new IllegalArgumentException(e); } } @Override public boolean hasNext() { if (buff == null) { fillBuff(); } return (buff != null); } private void fillBuff() { if (buff != null) { return; } // try to queue up the next article while (buff == null && iterImpl.hasNext()) { String line = iterImpl.next(); if (line.trim().equals("[") || line.trim().equals("]")) { continue; } try { if (line.endsWith(",")) { line = line.substring(0, line.length()-1); } if (!line.trim().isEmpty()) { buff = wdParser.parse(line); } } catch (Exception e) { LOG.warn("parsing of " + file + " failed for line '" + line + "':", e); } } } @Override public WikidataEntity next() { fillBuff(); WikidataEntity next = buff; buff = null; return next; } @Override public void remove() { throw new UnsupportedOperationException(); } } }