package no.priv.garshol.duke; /** * The default implementation of the Property interface. */ public class PropertyImpl implements Property { private String name; private boolean id; private boolean analyzed; // irrelevant if ID private boolean ignore; // irrelevant if ID private Comparator comparator; // irrelevant if ID private double high; // irrelevant if ID private double low; // irrelevant if ID private Lookup lookup; // irrelevant if ID // used to initialize ID properties public PropertyImpl(String name) { this.name = name; this.id = true; this.analyzed = false; this.lookup = Lookup.FALSE; } public PropertyImpl(String name, Comparator comparator, double low, double high) { this.name = name; this.id = false; this.analyzed = comparator != null && comparator.isTokenized(); this.comparator = comparator; this.high = high; this.low = low; this.lookup = Lookup.DEFAULT; } // FIXME: rules for property names? public String getName() { return name; } public boolean isIdProperty() { return id; } public boolean isAnalyzedProperty() { return analyzed; } public Comparator getComparator() { return comparator; } public double getHighProbability() { return high; } public double getLowProbability() { return low; } public Lookup getLookupBehaviour() { return lookup; } /** * Sets the comparator used for this property. Note that changing * this while Duke is processing may have unpredictable * consequences. */ public void setComparator(Comparator comparator) { this.comparator = comparator; } /** * Sets the high probability used for this property. Note that * changing this while Duke is processing may have unpredictable * consequences. */ public void setHighProbability(double high) { this.high = high; } /** * Sets the low probability used for this property. Note that * changing this while Duke is processing may have unpredictable * consequences. */ public void setLowProbability(double low) { this.low = low; } /** * Iff true the property should not be used for comparing records. */ public boolean isIgnoreProperty() { // some people set high probability to zero, which means these // properties will prevent any matches from occurring at all if // we try to use them. so we skip these. return ignore || high == 0.0; } /** * Makes Duke skip this property when comparing records. */ public void setIgnoreProperty(boolean ignore) { this.ignore = ignore; } /** * Sets the lookup behaviour of this property. */ public void setLookupBehaviour(Lookup lookup) { this.lookup = lookup; } /** * Returns the probability that the records v1 and v2 came from * represent the same entity, based on high and low probability * settings etc. */ public double compare(String v1, String v2) { // FIXME: it should be possible here to say that, actually, we // didn't learn anything from comparing these two values, so that // probability is set to 0.5. if (comparator == null) return 0.5; // we ignore properties with no comparator // first, we call the comparator, to get a measure of how similar // these two values are. note that this is not the same as what we // are going to return, which is a probability. double sim = comparator.compare(v1, v2); // we have been configured with a high probability (for equal // values) and a low probability (for different values). given // sim, which is a measure of the similarity somewhere in between // equal and different, we now compute our estimate of the // probability. // if sim = 1.0, we return high. if sim = 0.0, we return low. for // values in between we need to compute a little. the obvious // formula to use would be (sim * (high - low)) + low, which // spreads the values out equally spaced between high and low. // however, if the similarity is higher than 0.5 we don't want to // consider this negative evidence, and so there's a threshold // there. also, users felt Duke was too eager to merge records, // and wanted probabilities to fall off faster with lower // probabilities, and so we square sim in order to achieve this. if (sim >= 0.5) return ((high - 0.5) * (sim * sim)) + 0.5; else return low; } public Property copy() { if (id) return new PropertyImpl(name); PropertyImpl p = new PropertyImpl(name, comparator, low, high); p.setIgnoreProperty(ignore); p.setLookupBehaviour(lookup); return p; } public String toString() { return "[Property " + name + "]"; } }