package no.priv.garshol.duke.matchers;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import no.priv.garshol.duke.Configuration;
import no.priv.garshol.duke.DukeException;
import no.priv.garshol.duke.Link;
import no.priv.garshol.duke.LinkDatabase;
import no.priv.garshol.duke.LinkKind;
import no.priv.garshol.duke.LinkStatus;
import no.priv.garshol.duke.Property;
import no.priv.garshol.duke.Record;
/**
 * Maintains a LinkDatabase of the recorded matches. Assumes that the
 * same record may be processed several times (for example in
 * different runs), and will keep the database correctly updated.
 *
 * <p>Links found for a record are buffered in memory while that
 * record is being processed. When processing moves on to another
 * record (or the batch ends), the buffered links are reconciled with
 * the links already stored for the record, and the result is written
 * to the link database.
 *
 * <p><b>WARNING:</b> This class is not thread-safe, so attempting to
 * use it with multiple threads will lead to database corruption.
 */
public class LinkDatabaseMatchListener extends AbstractMatchListener {
  private final Configuration config;
  private final LinkDatabase linkdb;
  // the record whose links are currently being collected, or null if
  // no record is in progress
  private Record current;
  // links collected so far for the current record
  private Collection<Link> curlinks;

  /**
   * Creates a listener which writes links to the given database.
   *
   * @param config used to look up the identity properties of records
   * @param linkdb the link database to keep updated
   */
  public LinkDatabaseMatchListener(Configuration config, LinkDatabase linkdb) {
    this.config = config;
    this.linkdb = linkdb;
  }

  // the only callbacks we get are matches(), matchesPerhaps(), and
  // noMatchFor(). from these, we need to work out when Duke starts
  // on a new record, and call startRecord_() and endRecord_()
  // accordingly.

  /**
   * Buffers an inferred SAME link between the two records.
   *
   * @param r1 the record being processed
   * @param r2 the record it matched
   * @param confidence the confidence of the match
   */
  public void matches(Record r1, Record r2, double confidence) {
    recordLink(r1, r2, LinkKind.SAME, confidence);
  }

  /**
   * Buffers an inferred MAYBESAME link between the two records.
   *
   * @param r1 the record being processed
   * @param r2 the record it may match
   * @param confidence the confidence of the match
   */
  public void matchesPerhaps(Record r1, Record r2, double confidence) {
    recordLink(r1, r2, LinkKind.MAYBESAME, confidence);
  }

  /**
   * Notes that no match was found for the record. The record still
   * becomes the current record, so that its existing links are
   * reconciled on the next callback.
   *
   * @param record the record for which nothing was found
   */
  public void noMatchFor(Record record) {
    // this is the only call we'll get for this record. it means the
    // previous record has ended, and this one has begun (and will end
    // with the next call, whatever it is)
    if (current != null) {
      endRecord_();
    }
    startRecord_(record);
    // next callback will trigger endRecord_()
  }

  /**
   * Makes the given record the current one and starts a fresh, empty
   * buffer of links for it. Called from the event methods.
   *
   * @param r the record now being processed
   */
  public void startRecord_(Record r) {
    current = r;
    curlinks = new ArrayList<Link>();
  }

  /**
   * Reconciles the links buffered for the current record with the
   * links already in the database, and writes the result to the
   * database. Called from the event methods. Must only be called
   * while there is a current record; callers check this first.
   */
  public void endRecord_() {
    // this is where we actually update the link database. basically,
    // all we need to do is to retract those links which weren't seen
    // this time around, and that can be done via assertLink, since it
    // can override existing links.

    // get all the existing links
    Collection<Link> oldlinks = linkdb.getAllLinksFor(getIdentity(current));

    // build a hashmap so we can look up corresponding old links from
    // new links
    if (oldlinks != null) {
      Map<String, Link> oldmap = new HashMap<String, Link>(oldlinks.size());
      for (Link l : oldlinks) {
        oldmap.put(makeKey(l), l);
      }

      // removing all the links we found this time around from the set of
      // old links. any links remaining after this will be stale, and need
      // to be retracted. we iterate over a copy because curlinks is
      // modified inside the loop.
      for (Link newl : new ArrayList<Link>(curlinks)) {
        String key = makeKey(newl);
        Link oldl = oldmap.get(key);
        if (oldl == null) {
          continue; // a brand new link; nothing to reconcile
        }

        if (oldl.overrides(newl)) {
          // previous information overrides this link, so ignore
          curlinks.remove(newl);
        } else if (sameAs(oldl, newl)) {
          // there's no new information here, so just ignore this
          curlinks.remove(newl);
          oldmap.remove(key); // we don't want to retract the old one
        } else {
          // the link is out of date, but will be overwritten, so remove
          oldmap.remove(key);
        }
      }

      // all the inferred links left in oldmap are now old links we
      // didn't find on this pass. there is no longer any evidence
      // supporting them, and so we can retract them.
      for (Link oldl : oldmap.values()) {
        if (oldl.getStatus() == LinkStatus.INFERRED) {
          oldl.retract(); // changes to retracted, updates timestamp
          curlinks.add(oldl);
        }
      }
    }

    // okay, now we write it all to the database
    for (Link l : curlinks) {
      linkdb.assertLink(l);
    }
  }

  /**
   * Asks the link database to validate its connection before the
   * batch is processed.
   *
   * @param size the size of the batch (not used here)
   */
  public void batchReady(int size) {
    linkdb.validateConnection();
  }

  /**
   * Finishes the record in progress, if there is one, and commits the
   * link database.
   */
  public void batchDone() {
    // clearly, this is the end of the previous record. there may not
    // be one, though: if the batch produced no callbacks at all, or
    // batchDone() is called twice in a row, current is null and
    // endRecord_() would fail trying to read the record's identity.
    if (current != null) {
      endRecord_();
    }
    current = null;
    linkdb.commit();
  }

  // makes r1 the current record if it isn't already, first finishing
  // off the record which was in progress (if any)
  private void switchTo(Record r1) {
    // deliberately a reference comparison, as in the callbacks' contract
    // with this class: a different object means a different record
    if (r1 != current) {
      // we've finished processing the previous record, so make the calls
      if (current != null) {
        endRecord_();
      }
      startRecord_(r1);
    }
  }

  // shared implementation of matches() and matchesPerhaps(): buffers
  // an inferred link of the given kind between r1 and r2
  private void recordLink(Record r1, Record r2, LinkKind kind,
                          double confidence) {
    switchTo(r1);
    String id1 = getIdentity(r1);
    String id2 = getIdentity(r2);
    curlinks.add(new Link(id1, id2, LinkStatus.INFERRED, kind, confidence));
  }

  // returns the first value of the first identity property which has
  // any values in the record. throws DukeException if there is none.
  private String getIdentity(Record r) {
    for (Property p : config.getIdentityProperties()) {
      Collection<String> vs = r.getValues(p.getName());
      if (vs == null || vs.isEmpty()) {
        continue;
      }
      return vs.iterator().next();
    }
    throw new DukeException("No identity found in record [" +
                            PrintMatchListener.toString(r) + "]");
  }

  // builds the lookup key used to pair up old and new links: the two
  // IDs of the link, separated by a tab
  private String makeKey(Link l) {
    return l.getID1() + "\t" + l.getID2();
  }

  // true if the two links carry the same information, meaning the
  // same status and the same kind
  private boolean sameAs(Link l1, Link l2) {
    // we know the IDs are the same, so we're not going to check those
    return l1.getStatus() == l2.getStatus() &&
           l1.getKind() == l2.getKind();
    // confidence and timestamp are irrelevant
  }
}