/**
* MediawikiImporter
* Copyright 2008 by Michael Peter Christen
* First released 20.11.2008 at http://yacy.net
*
* This is a part of YaCy, a peer-to-peer based web search engine
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.importer;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Array;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.ByteBuffer;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.NumberTools;
import net.yacy.data.wiki.WikiCode;
import net.yacy.data.wiki.WikiParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.content.SurrogateReader;
import net.yacy.kelondro.util.NamePrefixThreadFactory;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
/*
* this class provides data structures to read a mediawiki dump file in xml format
* as referenced with xmlns="http://www.mediawiki.org/xml/export-0.3/"
*/
public class MediawikiImporter extends Thread implements Importer {
// markers that are searched line by line in the XML dump
private static final String textstart = "<text";
private static final String textend = "</text>";
private static final String pagestart = "<page>";
private static final String pageend = "</page>";
// byte representations of the page markers, used for the byte-wise scan in createIndex/find
private static final byte[] pagestartb = UTF8.getBytes(pagestart);
private static final byte[] pageendb = UTF8.getBytes(pageend);
private static final int docspermbinxmlbz2 = 800; // documents per megabyte in a xml.bz2 mediawiki dump
public static Importer job; // if started from a servlet, this object is used to store the thread
// location of the dump that is imported
public MultiProtocolURL sourcefile;
// directory that receives the generated xml files
public File targetdir;
// number of article records queued for conversion so far (incremented in run())
public int count;
// System.currentTimeMillis() at the moment run() was entered; 0 before that
private long start;
// value of sourcefile.length() at construction time
private final long docsize;
// estimated number of documents, derived from docsize and docspermbinxmlbz2
private final int approxdocs;
// host (plus port if not 80) and url prefix, both taken from the <base> element of the dump
private String hostport, urlStub;
// message of the last exception caught in run(), or null
private String errorMessage;
/**
 * Creates an importer thread for one dump. The thread is not started here.
 *
 * @param sourcefile location of the mediawiki dump; its length is read immediately,
 *        so a null value fails with a NullPointerException
 * @param targetdir directory where the generated xml files are written
 */
public MediawikiImporter(final MultiProtocolURL sourcefile, final File targetdir) {
    // the conditional must be parenthesized: '+' binds tighter than '!=' and '?:',
    // otherwise the whole concatenation is compared against null and the prefix is lost
    super("MediawikiImporter(" + (sourcefile != null ? sourcefile.toNormalform(true) : "null sourcefile") + ")");
    this.sourcefile = sourcefile;
    this.docsize = sourcefile.length();
    this.approxdocs = (int) (this.docsize * docspermbinxmlbz2 / 1024L / 1024L);
    this.targetdir = targetdir;
    this.count = 0;
    this.start = 0;
    this.hostport = null;
    this.urlStub = null;
    this.errorMessage = null;
}
/**
 * @return the number of article records queued for conversion so far
 */
@Override
public int count() {
    final int queued = this.count;
    return queued;
}
/**
 * @return the normal form of the dump location
 */
@Override
public String source() {
    final MultiProtocolURL dump = this.sourcefile;
    return dump.toNormalform(true);
}
/**
 * @return the message of the last recorded error, or an empty string if there is none
 */
@Override
public String status() {
    final String message = this.errorMessage;
    if (message == null) {
        return "";
    }
    return message;
}
/**
 * @return the average number of articles per second since the import was started;
 *         0 as long as nothing has been counted
 */
@Override
public int speed() {
    if (this.count == 0) {
        return 0;
    }
    // never divide by less than one second
    final long elapsedSeconds = Math.max(1L, runningTime());
    final long articlesPerSecond = this.count / elapsedSeconds;
    return (int) articlesPerSecond;
}
/**
 * Estimates the time still needed: the number of documents that are expected but not
 * yet counted, divided by the current speed in articles per second.
 *
 * @return the estimated remaining time in seconds, never negative
 */
@Override
public long remainingTime() {
    final int outstanding = Math.max(0, this.approxdocs - this.count);
    // a speed of zero is treated as one article per second
    final int rate = Math.max(1, speed());
    return outstanding / rate;
}
/**
 * @return the seconds elapsed between the recorded start time and now
 */
@Override
public long runningTime() {
    final long elapsedMillis = System.currentTimeMillis() - this.start;
    return elapsedMillis / 1000L;
}
/**
 * Reads the dump line by line, collects the content of every text element and hands
 * it to a pool of converter tasks; a single writer task stores the converted documents
 * as xml files in the target directory.
 * Any exception ends the import; its message is then available through status().
 */
@Override
public void run() {
    this.start = System.currentTimeMillis();
    final int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1);
    final BlockingQueue<wikiparserrecord> in = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
    // the writer keeps its output file open until it takes the poison record from this queue,
    // therefore the poison is delivered in the outermost finally block whatever happens before
    final BlockingQueue<wikiparserrecord> out = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
    final wikiparserrecord poison = newRecord();
    boolean consumersPoisoned = false;
    boolean writerPoisoned = false;
    InputStream is = null;
    BufferedReader reader = null;
    try {
        String targetstub = this.sourcefile.getFileName();
        // lastIndexOf searches a literal, not a regular expression
        int p = targetstub.lastIndexOf('.');
        if (p > 0) targetstub = targetstub.substring(0, p);

        is = new BufferedInputStream(this.sourcefile.getInputStream(ClientIdentification.yacyInternetCrawlerAgent), 1024 * 1024);
        if (this.sourcefile.getFileName().endsWith(".bz2")) {
            is = new BZip2CompressorInputStream(is);
        } else if (this.sourcefile.getFileName().endsWith(".gz")) {
            is = new GZIPInputStream(is);
        }
        reader = new BufferedReader(new java.io.InputStreamReader(is, StandardCharsets.UTF_8), 4 * 1024 * 1024);

        final ExecutorService service = Executors.newCachedThreadPool(
                new NamePrefixThreadFactory(MediawikiImporter.class.getSimpleName() + ".convertConsumer"));
        final Future<?>[] consumerResults = new Future<?>[threads];
        for (int i = 0; i < threads; i++) {
            consumerResults[i] = service.submit(new convertConsumer(in, out, poison));
        }
        final Future<Integer> writerResult = service.submit(new convertWriter(out, poison, this.targetdir, targetstub));
        // no further tasks are submitted; the submitted ones run to completion
        service.shutdown();

        String t;
        StringBuilder sb = new StringBuilder();
        boolean page = false, text = false;
        String title = null;
        int q;
        while ((t = reader.readLine()) != null) {
            if ((p = t.indexOf("<base>", 0)) >= 0 && (q = t.indexOf("</base>", p)) > 0) {
                //urlStub = "http://" + lang + ".wikipedia.org/wiki/";
                this.urlStub = t.substring(p + 6, q);
                if (!this.urlStub.endsWith("/")) {
                    q = this.urlStub.lastIndexOf('/');
                    if (q > 0) this.urlStub = this.urlStub.substring(0, q + 1);
                }
                final DigestURL uri = new DigestURL(this.urlStub);
                this.hostport = uri.getHost();
                if (uri.getPort() != 80) this.hostport += ":" + uri.getPort();
                continue;
            }
            if (t.indexOf(pagestart) >= 0) {
                page = true;
                continue;
            }
            if ((p = t.indexOf(textstart)) >= 0) {
                text = page;
                q = t.indexOf('>', p + textstart.length());
                if (q > 0) {
                    if (t.charAt(q - 1) == '/') {
                        // self-closing element: there is no content and no closing tag will follow
                        text = false;
                        continue;
                    }
                    final int u = t.indexOf(textend, q + 1);
                    if (u > q) {
                        // the complete element is on this line
                        sb.append(t, q + 1, u);
                        // the element is closed: following metadata lines must not be collected
                        text = false;
                        sb = enqueue(in, title, sb);
                        continue;
                    }
                    sb.append(t, q + 1, t.length());
                    // keep the line break between the first and the second line of the text
                    sb.append('\n');
                }
                continue;
            }
            if ((p = t.indexOf(textend)) >= 0) {
                // the last line of the text precedes the closing tag on the same line
                if (text) sb.append(t, 0, p);
                text = false;
                sb = enqueue(in, title, sb);
                continue;
            }
            if (t.indexOf(pageend) >= 0) {
                page = false;
                continue;
            }
            if ((p = t.indexOf("<title>", 0)) >= 0) {
                title = t.substring(p + 7);
                q = title.indexOf("</title>", 0);
                if (q >= 0) title = title.substring(0, q);
                continue;
            }
            if (text) {
                sb.append(t);
                sb.append('\n');
            }
        }

        // end of input: stop the converters first, then the writer
        try {
            for (int i = 0; i < threads; i++) {
                in.put(poison);
            }
            consumersPoisoned = true;
            for (int i = 0; i < threads; i++) {
                consumerResults[i].get(10000, TimeUnit.MILLISECONDS);
            }
        } catch (final Exception e) {
            this.errorMessage = e.getMessage();
            ConcurrentLog.logException(e);
        } finally {
            out.put(poison); // output thread condition (for file.close)
            writerPoisoned = true;
            writerResult.get(10000, TimeUnit.MILLISECONDS);
        }
    } catch (final Exception e) {
        this.errorMessage = e.getMessage();
        ConcurrentLog.logException(e);
    } finally {
        try {
            if (reader != null) {
                reader.close();
            } else if (is != null) {
                is.close();
            }
        } catch (final IOException e) {
            ConcurrentLog.warn("WIKITRANSLATION", "Could not close dump reader : " + e.getMessage());
        }
        if (!consumersPoisoned) {
            // reading was aborted: drop the pending records and release the converters, which
            // would otherwise wait on the queue forever; this thread is the only producer,
            // so the cleared queue has room for one poison per converter
            in.clear();
            for (int i = 0; i < threads; i++) {
                in.offer(poison);
            }
        }
        if (!writerPoisoned) {
            try {
                out.put(poison); // lets the writer close its file after a failure in the block above
            } catch (final InterruptedException ex) {
                ConcurrentLog.logException(ex);
            }
        }
    }
}

/**
 * Queues the collected article text for conversion and counts it.
 * Nothing is queued if the buffer is empty.
 *
 * @return the buffer to continue with: a fresh one if the text was queued, otherwise the given one
 * @throws InterruptedException if the thread is interrupted while waiting for space in the queue
 */
private StringBuilder enqueue(final BlockingQueue<wikiparserrecord> in, final String title, final StringBuilder sb) throws InterruptedException {
    ConcurrentLog.info("WIKITRANSLATION", "[INJECT] Title: " + title);
    if (sb.length() == 0) {
        ConcurrentLog.info("WIKITRANSLATION", "ERROR: " + title + " has empty content");
        return sb;
    }
    in.put(newRecord(this.hostport, this.urlStub, title, sb));
    this.count++;
    return new StringBuilder(200);
}
/**
 * Starts an indexMaker thread for the given dump unless its index file already exists.
 *
 * @param mediawikixml the dump file
 */
public static void checkIndex(final File mediawikixml) {
    if (idxFromMediawikiXML(mediawikixml).exists()) {
        return;
    }
    final indexMaker maker = new indexMaker(mediawikixml);
    maker.start();
}
/**
 * Thread that creates the index file for one dump by calling createIndex.
 */
public static class indexMaker extends Thread {
    File mediawikixml;

    /**
     * @param mediawikixml the dump file to index
     */
    public indexMaker(final File mediawikixml) {
        // the conditional must be parenthesized: '+' binds tighter than '!=' and '?:'
        super("MediawikiImporter.indexMaker " + (mediawikixml != null ? mediawikixml.getName() : ""));
        this.mediawikixml = mediawikixml;
    }

    @Override
    public void run() {
        try {
            createIndex(this.mediawikixml);
        } catch (final Exception e) {
            // includes IOException: a failed index creation must be visible in the log
            ConcurrentLog.logException(e);
        }
    }
}
/**
 * @param mediawikixml the dump file
 * @return the file with the absolute path of the dump plus the suffix ".idx.xml"
 */
public static File idxFromMediawikiXML(final File mediawikixml) {
    final String indexPath = mediawikixml.getAbsolutePath().concat(".idx.xml");
    return new File(indexPath);
}
/**
 * Scans the dump for page elements and writes the position, length and title of each
 * of them into the index file given by idxFromMediawikiXML.
 * Each page is handed to a wikiConsumer task, which passes its result to an indexProducer task.
 *
 * @param dumpFile the uncompressed dump
 * @throws IOException if the dump cannot be read or the index file cannot be created
 */
public static void createIndex(final File dumpFile) throws IOException {
    final PositionAwareReader in = new PositionAwareReader(dumpFile);
    try {
        final indexProducer producer = new indexProducer(100, idxFromMediawikiXML(dumpFile));
        final wikiConsumer consumer = new wikiConsumer(100, producer);
        final ExecutorService service = Executors.newCachedThreadPool(
                new NamePrefixThreadFactory(MediawikiImporter.class.getSimpleName() + ".createIndex"));
        final Future<Integer> consumerResult = service.submit(consumer);
        final Future<Integer> producerResult = service.submit(producer);
        service.shutdown();

        boolean interrupted = false;
        try {
            // read the wiki dump
            long start;
            while (in.seek(pagestartb)) {
                start = in.pos() - pagestartb.length;
                in.resetBuffer();
                if (!in.seek(pageendb)) break;
                consumer.consume(new wikiraw(in.bytes(), start, in.pos()));
                in.resetBuffer();
            }
        } finally {
            // shut down in pipeline order: the producer may only be poisoned after the consumer
            // has passed on its last record, otherwise records queued behind the poison are lost
            consumer.consume(wikiConsumer.poison);
            interrupted |= awaitTask(consumerResult);
            producer.consume(indexProducer.poison);
            interrupted |= awaitTask(producerResult);
        }
        // the flag is restored only now so that the poison deliveries above were not cut short
        if (interrupted) Thread.currentThread().interrupt();
    } finally {
        in.close();
    }
}

/**
 * Waits for a task and logs its failure.
 *
 * @return true if the waiting thread was interrupted
 */
private static boolean awaitTask(final Future<Integer> task) {
    try {
        task.get();
        return false;
    } catch (final InterruptedException e) {
        ConcurrentLog.logException(e);
        return true;
    } catch (final ExecutionException e) {
        ConcurrentLog.logException(e);
        return false;
    }
}
/**
 * Task that takes page records from its queue and writes one index entry per record
 * to the index file until it takes the poison record.
 */
private static class indexProducer implements Callable<Integer> {
    private final BlockingQueue<wikisourcerecord> entries;
    PrintWriter out;
    protected static wikisourcerecord poison = new wikisourcerecord("", 0, 0);
    int count;

    /**
     * @param bufferCount capacity of the record queue
     * @param indexFile the index file to create
     * @throws IOException if the index file cannot be opened for writing
     */
    public indexProducer(final int bufferCount, final File indexFile) throws IOException {
        this.entries = new ArrayBlockingQueue<wikisourcerecord>(bufferCount);
        // titles are decoded as UTF-8 and find() reads the index as UTF-8,
        // so the index must not be written in the platform default charset
        this.out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(indexFile), StandardCharsets.UTF_8)));
        this.count = 0;
        this.out.println("<index>");
    }

    /**
     * Queues a record for writing; blocks while the queue is full.
     */
    public void consume(final wikisourcerecord b) {
        try {
            this.entries.put(b);
        } catch (final InterruptedException e) {
            ConcurrentLog.logException(e);
        }
    }

    /**
     * @return the number of index entries written
     */
    @Override
    public Integer call() {
        wikisourcerecord r;
        try {
            while (true) {
                r = this.entries.take();
                if (r == poison) {
                    ConcurrentLog.info("WIKITRANSLATION", "producer / got poison");
                    break;
                }
                this.out.println("  <page start=\"" + r.start + "\" length=\"" + (r.end - r.start) + "\">");
                this.out.println("    <title>" + r.title + "</title>");
                this.out.println("  </page>");
                ConcurrentLog.info("WIKITRANSLATION", "producer / record start: " + r.start + ", title : " + r.title);
                this.count++;
            }
        } catch (final InterruptedException e) {
            ConcurrentLog.logException(e);
            Thread.currentThread().interrupt();
        } finally {
            // the index is terminated and closed even if an unexpected exception ends the loop
            this.entries.clear();
            this.out.println("</index>");
            this.out.close();
        }
        return Integer.valueOf(this.count);
    }
}
/**
 * Task that takes raw page chunks from its queue, extracts the title of each and passes
 * the resulting record to the index producer until it takes the poison chunk.
 */
private static class wikiConsumer implements Callable<Integer> {
    private final BlockingQueue<wikiraw> entries;
    protected static wikiraw poison = new wikiraw(new byte[0], 0, 0);
    private final indexProducer producer;
    private int count;

    /**
     * @param bufferCount capacity of the chunk queue
     * @param producer receiver of the extracted records
     */
    public wikiConsumer(final int bufferCount, final indexProducer producer) {
        this.entries = new ArrayBlockingQueue<wikiraw>(bufferCount);
        this.producer = producer;
        this.count = 0;
    }

    /**
     * Queues a chunk for processing; blocks while the queue is full.
     */
    public void consume(final wikiraw b) {
        try {
            this.entries.put(b);
        } catch (final InterruptedException e) {
            ConcurrentLog.logException(e);
        }
    }

    /**
     * @return the number of records passed to the producer
     */
    @Override
    public Integer call() {
        wikisourcerecord r;
        wikiraw c;
        try {
            while (true) {
                c = this.entries.take();
                if (c == poison) {
                    ConcurrentLog.info("WIKITRANSLATION", "consumer / got poison");
                    break;
                }
                try {
                    r = new wikisourcerecord(c.b, c.start, c.end);
                    this.producer.consume(r);
                    ConcurrentLog.info("WIKITRANSLATION", "consumer / record start: " + r.start + ", title : " + r.title);
                    this.count++;
                } catch (final RuntimeException e) {
                    // a chunk without a complete title element cannot be indexed: skip it, but leave a trace
                    ConcurrentLog.warn("WIKITRANSLATION", "consumer / skipped record at " + c.start + ": " + e.getMessage());
                }
            }
        } catch (final InterruptedException e) {
            ConcurrentLog.logException(e);
            Thread.currentThread().interrupt();
        }
        this.entries.clear();
        return Integer.valueOf(this.count);
    }
}
/**
 * The bytes of one page read from the dump together with its start and end position.
 */
private static class wikiraw {
    public long start, end;
    public byte[] b;

    /**
     * @param b the bytes collected for the page
     * @param start position where the page begins
     * @param end position where the page ends
     */
    public wikiraw(final byte[] b, final long start, final long end) {
        this.start = start;
        this.end = end;
        this.b = b;
    }
}
/**
 * Title and position range of one page in the dump.
 */
public static class wikisourcerecord {
    public long start, end;
    public String title;

    /**
     * Creates a record from a known title.
     */
    public wikisourcerecord(final String title, final long start, final long end) {
        this.start = start;
        this.end = end;
        this.title = title;
    }

    /**
     * Creates a record from the bytes of a page; the title is the content of the
     * first title element found in the chunk.
     *
     * @throws RuntimeException if the chunk has no opening or no closing title tag
     */
    public wikisourcerecord(final byte[] chunk, final long start, final long end) {
        final String page = UTF8.String(chunk);
        final int open = page.indexOf("<title>", 0);
        if (open < 0) {
            throw new RuntimeException("no title start in record");
        }
        final int close = page.indexOf("</title>", open);
        if (close < 0) {
            throw new RuntimeException("no title end in record");
        }
        // 7 is the length of the opening tag
        this.title = page.substring(open + 7, close);
        this.start = start;
        this.end = end;
    }
}
/**
 * @return a record with all values set to null; run() uses such a record as poison marker
 */
public wikiparserrecord newRecord() {
    final wikiparserrecord empty = new wikiparserrecord(null, null, null, null);
    return empty;
}

/**
 * @return a record for one article with the given host, url prefix, title and wiki source text
 */
public wikiparserrecord newRecord(final String hostport, final String urlStub, final String title, final StringBuilder sb) {
    final wikiparserrecord article = new wikiparserrecord(hostport, urlStub, title, sb);
    return article;
}
/**
 * One article on its way through the conversion: wiki source, generated html and parsed document.
 */
public class wikiparserrecord {
    public String title;
    String source, html, hostport, urlStub;
    AnchorURL url;
    Document document;

    /**
     * @param hostport host (and port) passed to the wiki parser
     * @param urlStub url prefix that is combined with the title to form the document url
     * @param title article title
     * @param sb wiki source text, may be null
     */
    public wikiparserrecord(final String hostport, final String urlStub, final String title, final StringBuilder sb) {
        this.title = title;
        this.hostport = hostport;
        this.urlStub = urlStub;
        this.source = (sb == null) ? null : sb.toString();
    }

    /**
     * Transforms the wiki source into html.
     *
     * @throws IOException if the transformation fails; the original exception is attached as cause
     */
    public void genHTML() throws IOException {
        try {
            final WikiParser wparser = new WikiCode();
            this.html = wparser.transform(this.hostport, this.source);
        } catch (final Exception e) {
            ConcurrentLog.logException(e);
            // keep the cause so that the origin of the failure is not lost
            throw new IOException(e.getMessage(), e);
        }
    }

    /**
     * Parses the generated html into a document. If the url built from urlStub and title
     * is malformed, the exception is logged and the document stays unset.
     *
     * @throws Parser.Failure if the html cannot be parsed
     */
    public void genDocument() throws Parser.Failure {
        try {
            this.url = new AnchorURL(this.urlStub + this.title);
            final Document[] parsed = TextParser.parseSource(this.url, "text/html", StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, 1, UTF8.getBytes(this.html));
            this.document = Document.mergeDocuments(this.url, "text/html", parsed);
            // the wiki parser is not able to find the proper title in the source text, so it must be set here
            this.document.setTitle(this.title);
        } catch (final MalformedURLException e1) {
            ConcurrentLog.logException(e1);
        }
    }

    /**
     * Writes the parsed document as xml to the given writer.
     */
    public void writeXML(final OutputStreamWriter os) throws IOException {
        this.document.writeXML(os);
    }
}
/**
 * Reads a file byte by byte, counts the bytes read and collects them in a buffer
 * that the caller can fetch and reset.
 */
private static class PositionAwareReader {
    private final InputStream is;
    private long seekpos;
    private ByteBuffer bb;

    /**
     * @throws FileNotFoundException if the file cannot be opened
     */
    public PositionAwareReader(final File dumpFile) throws FileNotFoundException {
        this.is = new BufferedInputStream(new FileInputStream(dumpFile), 64 * 1024);
        this.seekpos = 0;
        this.bb = new ByteBuffer();
    }

    /**
     * Empties the collected bytes; a buffer that has grown beyond 10 KB is replaced instead of cleared.
     */
    public void resetBuffer() {
        if (this.bb.length() > 10 * 1024) this.bb = new ByteBuffer(); else this.bb.clear();
    }

    /**
     * Reads forward until the pattern has been read completely. Every byte read is
     * counted and appended to the buffer.
     *
     * @param pattern the byte sequence to search for
     * @return true if the pattern was found, false if the end of the stream was reached
     */
    public boolean seek(final byte[] pattern) throws IOException {
        if (pattern.length == 0) return true;
        final int[] fallback = fallbackTable(pattern);
        int matched = 0;
        int c;
        while ((c = this.is.read()) >= 0) {
            this.seekpos++;
            this.bb.append(c);
            // read() delivers 0..255 while pattern bytes are signed
            final byte b = (byte) c;
            // on a mismatch, continue with the longest pattern prefix that still fits,
            // so that the current byte is not lost as a possible start of the pattern
            while (matched > 0 && pattern[matched] != b) matched = fallback[matched - 1];
            if (pattern[matched] == b) matched++;
            if (matched == pattern.length) return true;
        }
        return false;
    }

    /**
     * @return for every prefix of the pattern, the length of its longest proper prefix that is also its suffix
     */
    private static int[] fallbackTable(final byte[] pattern) {
        final int[] table = new int[pattern.length];
        int k = 0;
        for (int i = 1; i < pattern.length; i++) {
            while (k > 0 && pattern[i] != pattern[k]) k = table[k - 1];
            if (pattern[i] == pattern[k]) k++;
            table[i] = k;
        }
        return table;
    }

    /**
     * @return the number of bytes read so far
     */
    public long pos() {
        return this.seekpos;
    }

    /**
     * @return the bytes collected since the last reset
     */
    public byte[] bytes() {
        return this.bb.getBytes();
    }

    /**
     * Closes the underlying stream; a failure is logged.
     */
    public synchronized void close() {
        try {
            this.is.close();
        } catch (final IOException e) {
            ConcurrentLog.logException(e);
        }
    }
}
/**
 * Reads a byte range from a file.
 *
 * @param f the file to read from
 * @param start offset of the first byte
 * @param len number of bytes to read
 * @return an array of length len; if the file ends earlier, the remaining bytes are zero.
 *         null if an I/O error occurred (the error is logged)
 */
public static byte[] read(final File f, final long start, final int len) {
    final byte[] b = new byte[len];
    try (RandomAccessFile raf = new RandomAccessFile(f, "r")) {
        raf.seek(start);
        // a single read call may deliver fewer bytes than requested
        int filled = 0;
        while (filled < len) {
            final int n = raf.read(b, filled, len - filled);
            if (n < 0) break; // end of file
            filled += n;
        }
    } catch (final IOException e) {
        ConcurrentLog.logException(e);
        return null;
    }
    return b;
}
/**
 * Searches an index file, as written by createIndex, for the entry with the given title.
 *
 * @param title the title exactly as it is stored in the index
 * @param f the index file
 * @return the record with the start and end position taken from the entry, or null if there is
 *         no entry with that title or the entry lacks a start or length attribute
 * @throws IOException if the index file cannot be read
 */
public static wikisourcerecord find(final String title, final File f) throws IOException {
    final PositionAwareReader in = new PositionAwareReader(f);
    try {
        final byte[] entrystart = UTF8.getBytes("<page ");
        final String m = "<title>" + title + "</title>";
        String s;
        while (in.seek(entrystart)) {
            in.resetBuffer();
            if (!in.seek(pageendb)) break;
            s = UTF8.String(in.bytes());
            in.resetBuffer();
            if (s.indexOf(m) >= 0) {
                // we found the record
                int p = s.indexOf("start=\"", 0);
                if (p < 0) return null;
                p += 7;
                int q = s.indexOf('"', p + 1);
                if (q < 0) return null;
                final long start = NumberTools.parseLongDecSubstring(s, p, q);
                p = s.indexOf("length=\"", q);
                if (p < 0) return null;
                p += 8;
                q = s.indexOf('"', p + 1);
                if (q < 0) return null;
                final int length = NumberTools.parseIntDecSubstring(s, p, q);
                return new wikisourcerecord(title, start, start + length);
            }
        }
        return null;
    } finally {
        // release the file handle on every path
        in.close();
    }
}
/**
 * Task that takes article records from the input queue, converts them to html and to a
 * document and puts the converted records into the output queue until it takes the poison record.
 */
private static class convertConsumer implements Callable<Integer> {
    private final BlockingQueue<wikiparserrecord> in, out;
    private final wikiparserrecord poison;

    /**
     * @param in queue of records to convert
     * @param out queue for converted records
     * @param poison the record instance that ends this task
     */
    public convertConsumer(final BlockingQueue<wikiparserrecord> in, final BlockingQueue<wikiparserrecord> out, final wikiparserrecord poison) {
        this.poison = poison;
        this.in = in;
        this.out = out;
    }

    /**
     * @return always 0
     */
    @Override
    public Integer call() {
        wikiparserrecord record;
        try {
            while (true) {
                record = this.in.take();
                if (record == this.poison) {
                    ConcurrentLog.info("WIKITRANSLATION", "convertConsumer / got poison");
                    break;
                }
                try {
                    record.genHTML();
                    record.genDocument();
                    if (record.document == null) {
                        // genDocument leaves the document unset when the url is malformed;
                        // such a record has nothing that could be written
                        ConcurrentLog.warn("WIKITRANSLATION", "convertConsumer / no document for title: " + record.title);
                        continue;
                    }
                    this.out.put(record);
                } catch (final RuntimeException e) {
                    ConcurrentLog.logException(e);
                } catch (final Parser.Failure e) {
                    ConcurrentLog.logException(e);
                } catch (final IOException e) {
                    ConcurrentLog.logException(e);
                }
            }
        } catch (final InterruptedException e) {
            ConcurrentLog.logException(e);
            Thread.currentThread().interrupt();
        }
        ConcurrentLog.info("WIKITRANSLATION", "*** convertConsumer has terminated");
        return Integer.valueOf(0);
    }
}
/**
 * Task that takes converted records from its queue and writes them into numbered xml files
 * in the target directory. A file is written with the suffix ".xml.prt" and renamed to ".xml"
 * when it is complete; a new file is started after 10000 records.
 */
private static class convertWriter implements Callable<Integer> {
    private final BlockingQueue<wikiparserrecord> in;
    private final wikiparserrecord poison;
    private OutputStreamWriter osw;
    private final String targetstub;
    private final File targetdir;
    private int fc, rc;
    private String outputfilename;

    /**
     * @param in queue of converted records
     * @param poison the record instance that ends this task
     * @param targetdir directory for the output files
     * @param targetstub name prefix of the output files
     */
    public convertWriter(
            final BlockingQueue<wikiparserrecord> in,
            final wikiparserrecord poison,
            final File targetdir,
            final String targetstub) {
        this.poison = poison;
        this.in = in;
        this.osw = null;
        this.targetdir = targetdir;
        this.targetstub = targetstub;
        this.fc = 0;
        this.rc = 0;
        this.outputfilename = null;
    }

    /**
     * @return always 0
     */
    @Override
    public Integer call() {
        wikiparserrecord record;
        boolean failed = false;
        try {
            while (true) {
                record = this.in.take();
                if (record == this.poison) {
                    ConcurrentLog.info("WIKITRANSLATION", "convertWriter / got poison");
                    break;
                }
                // after a write failure the queue is still emptied until the poison arrives,
                // otherwise the tasks that fill the queue would block on it forever
                if (failed) continue;
                if (record.document == null) {
                    ConcurrentLog.warn("WIKITRANSLATION", "convertWriter / no document for title: " + record.title);
                    continue;
                }
                try {
                    if (this.osw == null) openNextFile();
                    ConcurrentLog.info("WIKITRANSLATION", "[CONSUME] Title: " + record.title);
                    record.document.writeXML(this.osw);
                    this.rc++;
                    if (this.rc >= 10000) {
                        // the next file is opened when the next record arrives,
                        // so no empty file is left behind at the end
                        closeCurrentFile();
                        this.rc = 0;
                        this.fc++;
                    }
                } catch (final IOException e) {
                    ConcurrentLog.logException(e);
                    failed = true;
                }
            }
        } catch (final InterruptedException e) {
            ConcurrentLog.logException(e);
            Thread.currentThread().interrupt();
        } finally {
            try {
                closeCurrentFile(); // no file is open if the poison came first or right after a rotation
            } catch (final IOException e) {
                ConcurrentLog.logException(e);
            }
        }
        ConcurrentLog.info("WIKITRANSLATION", "*** convertWriter has terminated");
        return Integer.valueOf(0);
    }

    /** Opens the output file for the current file number and writes the xml header. */
    private void openNextFile() throws IOException {
        this.outputfilename = this.targetstub + "." + this.fc + ".xml.prt";
        this.osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(this.targetdir, this.outputfilename))), StandardCharsets.UTF_8);
        this.osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + SurrogateReader.SURROGATES_MAIN_ELEMENT_OPEN + "\n");
    }

    /** Terminates and closes the open output file, if any, and renames it to its final name. */
    private void closeCurrentFile() throws IOException {
        if (this.osw == null) return;
        final OutputStreamWriter current = this.osw;
        this.osw = null;
        try {
            current.write(SurrogateReader.SURROGATES_MAIN_ELEMENT_CLOSE + "\n");
        } finally {
            current.close();
        }
        final String finalfilename = this.targetstub + "." + this.fc + ".xml";
        if (!new File(this.targetdir, this.outputfilename).renameTo(new File(this.targetdir, finalfilename))) {
            ConcurrentLog.warn("WIKITRANSLATION", "convertWriter / could not rename " + this.outputfilename + " to " + finalfilename);
        }
    }
}
/**
 * Command line entry point. The first argument selects the action:
 * -convert, -index, -read or -find; without arguments the usage is printed.
 *
 * @param s the command line arguments
 */
public static void main(final String[] s) {
    if (s.length == 0) {
        System.out.println("usage:");
        System.out.println(" -index <wikipedia-dump>");
        System.out.println(" -read <start> <len> <idx-file>");
        System.out.println(" -find <title> <wikipedia-dump>");
        System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir>");
        ConcurrentLog.shutdown();
        return;
    }
    // the finally block below closes the connection manager and shuts the log down on every path
    try {
        // example:
        // java -Xmx2000m -cp classes:lib/bzip2.jar
        // de.anomic.tools.mediawikiIndex -convert
        // DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2
        // DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/
        if (s[0].equals("-convert")) {
            if (s.length < 3) {
                System.out.println("usage:");
                System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir>");
                return;
            }
            final File targetdir = new File(s[2]);
            try {
                final MediawikiImporter mi = new MediawikiImporter(new MultiProtocolURL(s[1]), targetdir);
                mi.start();
                mi.join();
            } catch (final InterruptedException e) {
                ConcurrentLog.logException(e);
            } catch (final MalformedURLException e) {
                ConcurrentLog.logException(e);
            }
        }
        if (s[0].equals("-index")) {
            if (s.length < 2) {
                System.out.println("usage:");
                System.out.println(" -index <wikipedia-dump>");
                return;
            }
            try {
                createIndex(new File(s[1]));
            } catch (final IOException e) {
                ConcurrentLog.logException(e);
            }
        }
        if (s[0].equals("-read")) {
            if (s.length < 4) {
                System.out.println("usage:");
                System.out.println(" -read <start> <len> <idx-file>");
                return;
            }
            // the offset is a long: positions beyond 2 GB must be accepted
            final long start = Long.parseLong(s[1]);
            final int len = Integer.parseInt(s[2]);
            final byte[] b = read(new File(s[3]), start, len);
            // read returns null after an I/O error, which it has already logged
            if (b != null) System.out.println(UTF8.String(b));
        }
        if (s[0].equals("-find")) {
            if (s.length < 3) {
                System.out.println("usage:");
                System.out.println(" -find <title> <wikipedia-dump>");
                return;
            }
            try {
                final wikisourcerecord w = find(s[1], new File(s[2] + ".idx.xml"));
                if (w == null) {
                    ConcurrentLog.info("WIKITRANSLATION", "not found");
                } else {
                    final byte[] b = read(new File(s[2]), w.start, (int) (w.end - w.start));
                    if (b != null) System.out.println(UTF8.String(b));
                }
            } catch (final IOException e) {
                ConcurrentLog.logException(e);
            }
        }
    } finally {
        try {
            HTTPClient.closeConnectionManager();
        } catch (final InterruptedException e) {
            ConcurrentLog.logException(e);
        }
        ConcurrentLog.shutdown();
    }
}
}