package no.priv.garshol.duke;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.util.List;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Properties;
import no.priv.garshol.duke.matchers.AbstractMatchListener;
import no.priv.garshol.duke.matchers.PrintMatchListener;
import no.priv.garshol.duke.matchers.TestFileListener;
import no.priv.garshol.duke.utils.YesNoConsole;
import no.priv.garshol.duke.utils.LinkFileWriter;
import no.priv.garshol.duke.utils.NTriplesWriter;
import no.priv.garshol.duke.utils.LinkDatabaseUtils;
import no.priv.garshol.duke.utils.CommandLineParser;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
/**
* Command-line interface to the engine.
*/
public class Duke {
private static Properties properties;
public static void main(String[] argv) throws IOException {
try {
main_(argv);
} catch (DukeConfigException e) {
System.err.println("ERROR: " + e.getMessage());
}
}
public static void main_(String[] argv) throws IOException {
// parse command-line
CommandLineParser parser = setupParser();
try {
argv = parser.parse(argv);
} catch (CommandLineParser.CommandLineParserException e) {
System.err.println("ERROR: " + e.getMessage());
usage();
System.exit(1);
}
// set up some initial options
boolean datadebug = parser.getOptionState("showdata");
Logger logger = new CommandLineLogger(parser.getOptionState("verbose") ?
1 : 0);
boolean progress = parser.getOptionState("progress");
int count = 0;
int batch_size = parser.getOptionInteger("batchsize", 40000);
int threads = parser.getOptionInteger("threads", 1);
// load the configuration
Configuration config;
try {
config = ConfigLoader.load(argv[0]);
} catch (FileNotFoundException e) {
System.err.println("ERROR: Config file '" + argv[0] + "' not found!");
return;
} catch (SAXParseException e) {
System.err.println("ERROR: Couldn't parse config file: " + e.getMessage());
System.err.println("Error in " + e.getSystemId() + ":" +
e.getLineNumber() + ":" + e.getColumnNumber());
return;
} catch (SAXException e) {
System.err.println("ERROR: Couldn't parse config file: " + e.getMessage());
return;
}
// validate the configuration
if (!datadebug) // unless --showdata
config.validate();
// if we're in data debug mode we branch out here
if (datadebug) {
showdata(config);
return; // stop here
}
// set up listeners
boolean noreindex = parser.getOptionState("noreindex");
Processor processor = new Processor(config, !noreindex);
processor.setLogger(logger);
processor.setThreads(threads);
// sanity check
if (noreindex && processor.getDatabase().isInMemory()) {
System.out.println("Option --noreindex not available with in-memory " +
"database");
return;
}
// display lookup properties?
if (parser.getOptionState("lookups")) {
System.out.println("Lookup properties:");
for (Property p : config.getLookupProperties())
System.out.println(" " + p.getName());
System.out.println();
}
boolean interactive = parser.getOptionState("interactive");
boolean pretty = parser.getOptionState("pretty") || interactive;
boolean showmatches = parser.getOptionState("showmatches") || interactive;
PrintMatchListener listener =
new PrintMatchListener(showmatches,
parser.getOptionState("showmaybe"),
progress,
!config.isDeduplicationMode(),
config.getProperties(),
pretty);
processor.addMatchListener(listener);
// needs to be before the link file handler, in case the link file
// is the same as the test file
TestFileListener testfile = null;
if (parser.getOptionValue("testfile") != null) {
testfile = new TestFileListener(parser.getOptionValue("testfile"),
config,
parser.getOptionState("testdebug"),
processor,
showmatches,
pretty);
testfile.setPessimistic(true);
processor.addMatchListener(testfile);
if (testfile.isEmpty())
System.out.println("WARN: Test file is empty. Did you mean --linkfile?");
}
AbstractLinkFileListener linkfile = null;
if (parser.getOptionValue("linkfile") != null) {
String fname = parser.getOptionValue("linkfile");
if (fname.endsWith(".ntriples"))
linkfile = new NTriplesLinkFileListener(fname, config.getIdentityProperties());
else
linkfile = new LinkFileListener(fname, config.getIdentityProperties(),
interactive,
parser.getOptionValue("testfile"));
processor.addMatchListener(linkfile);
}
// --profile
if (parser.getOptionState("profile"))
processor.setPerformanceProfiling(true);
// --singlematch setting
boolean matchall = true;
if (parser.getOptionState("singlematch")) {
if (config.isDeduplicationMode())
throw new DukeConfigException("--singlematch only works in record linkage mode");
matchall = false;
}
// this is where we get started for real. the first thing we do
// is to distinguish between modes.
if (config.isDeduplicationMode())
// deduplication mode
processor.deduplicate(config.getDataSources(), batch_size);
else {
// record linkage mode
if (noreindex) {
// user has specified that they already have group 1 indexed up,
// and don't want to do it again, for whatever reason. in that
// case we just do the linking, and don't touch group 1 at all.
processor.linkRecords(config.getDataSources(2), matchall);
} else
processor.link(config.getDataSources(1),
config.getDataSources(2),
matchall,
batch_size);
}
// close up shop, then finish
if (parser.getOptionValue("linkfile") != null)
linkfile.close();
processor.close();
}
private static void showdata(Configuration config) {
List<Property> props = config.getProperties();
List<DataSource> sources = new ArrayList();
sources.addAll(config.getDataSources());
sources.addAll(config.getDataSources(1));
sources.addAll(config.getDataSources(2));
for (DataSource src : sources) {
RecordIterator it = src.getRecords();
while (it.hasNext()) {
Record r = it.next();
PrintMatchListener.prettyPrint(r, props);
System.out.println("");
}
it.close();
}
}
private static void usage() {
System.out.println("");
System.out.println("java no.priv.garshol.duke.Duke [options] <cfgfile>");
System.out.println("");
System.out.println(" --progress show progress report while running");
System.out.println(" --showmatches show matches while running");
System.out.println(" --linkfile=<file> output matches to link file");
System.out.println(" --interactive query user before outputting link file matches");
System.out.println(" --testfile=<file> test matches against known correct results in file");
System.out.println(" --testdebug display failures");
System.out.println(" --verbose display diagnostics");
System.out.println(" --noreindex reuse existing Lucene index");
System.out.println(" --batchsize=n set size of Lucene indexing batches");
System.out.println(" --showdata show all cleaned data (data debug mode)");
System.out.println(" --profile display performance statistics");
System.out.println(" --threads=N run processing in N parallell threads");
System.out.println(" --pretty pretty display when comparing records");
System.out.println(" --singlematch (in record linkage mode) only accept");
System.out.println(" the best match for each record");
System.out.println(" --lookups display lookup properties");
System.out.println("");
System.out.println("Duke version " + getVersionString());
}
private static CommandLineParser setupParser() {
CommandLineParser parser = new CommandLineParser();
parser.setMinimumArguments(1);
parser.setMaximumArguments(1);
parser.addBooleanOption("progress", 'p');
parser.addStringOption("linkfile", 'l');
parser.addStringOption("linkendpoint", 'e');
parser.addBooleanOption("showmatches", 's');
parser.addBooleanOption("showmaybe", 'm');
parser.addStringOption("testfile", 'T');
parser.addBooleanOption("testdebug", 't');
parser.addStringOption("batchsize", 'b');
parser.addBooleanOption("verbose", 'v');
parser.addStringOption("threads", 'P');
parser.addBooleanOption("noreindex", 'N');
parser.addBooleanOption("interactive", 'I');
parser.addBooleanOption("showdata", 'D');
parser.addBooleanOption("profile", 'o');
parser.addStringOption("threads", 'n');
parser.addBooleanOption("pretty", 'n');
parser.addBooleanOption("singlematch", 'n');
parser.addBooleanOption("lookups", 'L');
return parser;
}
public static String getVersionString() {
Properties props = getProperties();
return props.getProperty("duke.version") + ", build " +
props.getProperty("duke.build") + ", built by " +
props.getProperty("duke.builder");
}
public static String getVersion() {
return getProperties().getProperty("duke.version");
}
private static Properties getProperties() {
if (properties == null) {
properties = new Properties();
try {
InputStream in = Duke.class.getClassLoader().getResourceAsStream("no/priv/garshol/duke/duke.properties");
properties.load(in);
in.close();
} catch (IOException e) {
throw new DukeException("Couldn't load duke.properties", e);
}
}
return properties;
}
static abstract class AbstractLinkFileListener extends AbstractMatchListener {
private Collection<Property> idprops;
public AbstractLinkFileListener(Collection<Property> idprops) {
this.idprops = idprops;
}
public void close() throws IOException {
}
public abstract void link(String id1, String id2, double confidence)
throws IOException;
public void matches(Record r1, Record r2, double confidence) {
try {
for (Property p : idprops)
for (String id1 : r1.getValues(p.getName()))
for (String id2 : r2.getValues(p.getName()))
link(id1, id2, confidence);
} catch (IOException e) {
throw new DukeException(e);
}
}
}
static class LinkFileListener extends AbstractLinkFileListener {
private Writer out;
private LinkFileWriter writer;
private LinkDatabase linkdb;
private YesNoConsole console;
public LinkFileListener(String linkfile, Collection<Property> idprops,
boolean interactive, String testfile)
throws IOException {
super(idprops);
if (interactive) {
this.console = new YesNoConsole();
this.linkdb = new InMemoryLinkDatabase();
if (testfile != null)
linkdb = LinkDatabaseUtils.loadTestFile(testfile);
}
// have to start writing the link file *after* we load the test
// file, because they may be the same file...
// second param: if there is a test file, we append to the link
// file, instead of overwriting
this.out = new FileWriter(linkfile, testfile != null);
this.writer = new LinkFileWriter(out);
// FIXME: this will only work if the two files are the same
}
public void link(String id1, String id2, double confidence)
throws IOException {
boolean correct = true;
// does this provide new information, or do we know it already?
Link inferredlink = null;
if (linkdb != null)
inferredlink = linkdb.inferLink(id1, id2);
// record it
if (console != null) {
if (inferredlink == null) {
correct = console.yesorno();
confidence = 1.0; // the user told us, which is as certain as it gets
} else {
correct = inferredlink.getKind() == LinkKind.SAME;
confidence = inferredlink.getConfidence();
}
}
// note that we also write inferred links, because the test file
// listener does not do inference
writer.write(id1, id2, correct, confidence);
out.flush(); // make sure we preserve the data
if (linkdb != null && inferredlink == null) {
Link link = new Link(id1, id2, LinkStatus.ASSERTED,
correct ? LinkKind.SAME : LinkKind.DIFFERENT, 1.0);
linkdb.assertLink(link);
}
}
public void close() throws IOException {
out.close();
}
}
static class NTriplesLinkFileListener extends AbstractLinkFileListener {
private FileOutputStream fos;
private NTriplesWriter out;
public NTriplesLinkFileListener(String linkfile,
Collection<Property> idprops)
throws IOException {
super(idprops);
this.fos = new FileOutputStream(linkfile);
this.out = new NTriplesWriter(fos);
}
public void link(String id1, String id2, double confidence)
throws IOException {
out.statement(id1, "http://www.w3.org/2002/07/owl#sameAs", id2, false);
}
public void close() throws IOException {
out.done();
fos.close();
}
}
static class CommandLineLogger implements Logger {
private int loglevel; // 1: trace, 2: debug, 3: info, 4: warn, 5: error
private CommandLineLogger(int loglevel) {
this.loglevel = loglevel;
}
public void trace(String msg) {
if (isTraceEnabled())
System.out.println(msg);
}
public void debug(String msg) {
if (isDebugEnabled())
System.out.println(msg);
}
public void info(String msg) {
if (isInfoEnabled())
System.out.println(msg);
}
public void warn(String msg) {
warn(msg, null);
}
public void warn(String msg, Throwable e) {
if (!isWarnEnabled())
return;
System.out.println(msg + " " + e);
e.printStackTrace();
}
public void error(String msg) {
error(msg, null);
}
public void error(String msg, Throwable e) {
if (!isErrorEnabled())
return;
System.out.println(msg + " " + e);
e.printStackTrace();
}
public boolean isTraceEnabled() {
return loglevel == 1;
}
public boolean isDebugEnabled() {
return loglevel != 0 && loglevel < 3;
}
public boolean isInfoEnabled() {
return loglevel != 0 && loglevel < 4;
}
public boolean isWarnEnabled() {
return loglevel != 0 && loglevel < 5;
}
public boolean isErrorEnabled() {
return loglevel != 0 && loglevel < 6;
}
}
}