package org.wikibrain.parser; import org.apache.commons.compress.archivers.ArchiveException; import org.wikibrain.utils.WpIOUtils; import javax.xml.stream.XMLStreamException; import java.io.*; import java.util.Iterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Iterates over a file containing an XML dump of wikipedia. * Each string is the contents of a single article. * Iterators are independent, so multiple iterators can simultaneously open a dump file. */ public class DumpSplitter implements Iterable<String> { public static final String ARTICLE_BEGIN = "<page>"; public static final String ARTICLE_END = "</page>"; private static final int MAX_ARTICLE_LENGTH = 10000000; // Maximum length of article private static final Logger LOG = LoggerFactory.getLogger(DumpSplitter.class); private File path; /** * Creates an iterator over the given file. * The file can be gzipped or bzipped. * @param path */ public DumpSplitter(File path) { this.path = path; } public File getPath() { return path; } @Override public Iterator<String> iterator() { try { return new ArticleIterator(path); } catch (IOException e) { LOG.error("article iterator construction failed", e); throw new RuntimeException(e); } catch (ArchiveException e) { LOG.error("article iterator construction failed", e); throw new RuntimeException(e); } catch (XMLStreamException e) { LOG.error("article iterator construction failed", e); throw new RuntimeException(e); } } public class ArticleIterator implements Iterator<String> { private BufferedReader reader; private String buffer = null; private int lineNum = 0; private boolean closed = false; public ArticleIterator(File path) throws IOException, ArchiveException, XMLStreamException { reader = WpIOUtils.openBufferedReader(path); } private void fillBuffer() { if (closed || buffer != null) { return; } try { String articleOpen = readToArticleBegin(); if (articleOpen == null) { return; } buffer = readToArticleClose(articleOpen); } catch (IOException e) { logParseError("parser failed", e); e.printStackTrace(); } } /** * Reads until it finds the beginning of an article. * @return the line with the beginning tag. * @throws IOException */ private String readToArticleBegin() throws IOException { while (true) { String line = readLine(); if (line == null) { return null; } if (line.trim().equals(ARTICLE_BEGIN)) { return line + "\n"; } } } /** * Reads until the end of the article. * If the article is too long, it truncates the article and adds a closing tag. * @param articleOpen First line of the article. * @return */ private String readToArticleClose(String articleOpen) throws IOException { StringBuffer buffer = new StringBuffer(articleOpen); while (true) { String line = readLine(); if (line == null) { logParseError("reached eof in middle of article"); buffer.append(ARTICLE_END + "\n"); break; } if (buffer.length() + line.length() > MAX_ARTICLE_LENGTH) { logParseError("truncating overly long article"); buffer.append(ARTICLE_END + "\n"); break; } buffer.append(line + "\n"); if (line.trim().equals(ARTICLE_END)) { break; } } return buffer.toString(); } private void logParseError(String message) { LOG.error("parsing " + path + " failed in line " + message); } private void logParseError(String message, Exception e) { LOG.error("parsing " + path + " failed in line " + message); } private String readLine() throws IOException { if (closed) { return null; } String line = reader.readLine(); if (line == null) { reader.close(); closed = true; return null; } lineNum++; return line; } @Override public boolean hasNext() { fillBuffer(); return (buffer != null); } public String next() { fillBuffer(); String tmp = buffer; buffer = null; return tmp; } @Override public void remove() { throw new UnsupportedOperationException(); } } }