package org.wikibrain.parser.xml; import org.wikibrain.core.lang.LanguageInfo; import org.wikibrain.core.model.RawPage; import org.wikibrain.parser.DumpSplitter; import org.wikibrain.parser.WpParseException; import java.io.File; import java.util.Iterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class DumpPageXmlParser implements Iterable<RawPage> { public static final Logger LOG = LoggerFactory.getLogger(DumpSplitter.class); private final PageXmlParser parser; DumpSplitter impl; LanguageInfo language; /** * @param file * @param language TODO: read language from dump file! */ public DumpPageXmlParser(File file, LanguageInfo language) { this.language = language; this.impl = new DumpSplitter(file); this.parser = new PageXmlParser(language); } @Override public Iterator<RawPage> iterator() { return new IteratorImpl(); } public class IteratorImpl implements Iterator<RawPage> { private final Iterator<String> iterImpl; private RawPage buff; public IteratorImpl() { this.iterImpl = impl.iterator(); } @Override public boolean hasNext() { if (buff == null) { fillBuff(); } return (buff != null); } private void fillBuff() { if (buff != null) { return; } // try to queue up the next article while (buff == null && iterImpl.hasNext()) { try { buff = parser.parse(iterImpl.next()); } catch (WpParseException e) { LOG.warn("parsing of " + impl.getPath() + " failed:", e); } } } @Override public RawPage next() { fillBuff(); RawPage next = buff; buff = null; return next; } @Override public void remove() { throw new UnsupportedOperationException(); } } }