package no.priv.garshol.duke; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import no.priv.garshol.duke.utils.Utils; /** * Holds the configuration details for a dataset. */ public class ConfigurationImpl implements Configuration { // there are two modes: deduplication and record linkage. in // deduplication mode all sources are in 'datasources'. in record // linkage mode they are in 'group1' and 'group2'. couldn't think // of a better solution. sorry. private Collection<DataSource> datasources; private Collection<DataSource> group1; private Collection<DataSource> group2; private double threshold; private double thresholdMaybe; private Map<String, Property> properties; private List<Property> proplist; // duplicate to preserve order private Collection<Property> lookups; // subset of properties private Database database1; private Database database2; // used for record linkage, if necessary private List<Comparator> customComparators; public ConfigurationImpl() { this.datasources = new ArrayList(); this.group1 = new ArrayList(); this.group2 = new ArrayList(); this.customComparators = new ArrayList<Comparator>(); } /** * Returns the data sources to use (in deduplication mode; don't use * this method in record linkage mode). */ public Collection<DataSource> getDataSources() { return datasources; } /** * Returns the data sources belonging to a particular group of data * sources. Data sources are grouped in record linkage mode, but not * in deduplication mode, so only use this method in record linkage * mode. */ public Collection<DataSource> getDataSources(int groupno) { if (groupno == 1) return group1; else if (groupno == 2) return group2; else throw new DukeConfigException("Invalid group number: " + groupno); } /** * Adds a data source to the configuration. If in deduplication mode * groupno == 0, otherwise it gives the number of the group to which * the data source belongs. */ public void addDataSource(int groupno, DataSource datasource) { // the loader takes care of validation if (groupno == 0) datasources.add(datasource); else if (groupno == 1) group1.add(datasource); else if (groupno == 2) group2.add(datasource); } public Database getDatabase(boolean overwrite) { return getDatabase(1, overwrite); } public Database getDatabase(int groupno, boolean overwrite) { Database thedb; if (groupno == 1) { if (database1 == null) // not set, so use default with is in memory database1 = new no.priv.garshol.duke.databases.InMemoryDatabase(); thedb = database1; } else if (groupno == 2) thedb = database2; // no default for no 2 else throw new DukeException("Can only have two databases"); if (thedb != null) { thedb.setConfiguration(this); thedb.setOverwrite(overwrite); // hmmm? } return thedb; } public void addDatabase(Database database) { if (database1 == null) database1 = database; else if (database2 == null) database2 = database; else throw new DukeConfigException("Too many database objects configured"); } /** * The probability threshold used to decide whether two records * represent the same entity. If the probability is higher than this * value, the two records are considered to represent the same * entity. */ public double getThreshold() { return threshold; } /** * Sets the probability threshold for considering two records * equivalent. */ public void setThreshold(double threshold) { this.threshold = threshold; } /** * The probability threshold used to decide whether two records may * represent the same entity. If the probability is higher than this * value, the two records are considered possible matches. Can be 0, * in which case no records are considered possible matches. */ public double getMaybeThreshold() { return thresholdMaybe; } /** * Returns true iff we are in deduplication mode. */ public boolean isDeduplicationMode() { return !getDataSources().isEmpty(); } /** * Sets the probability threshold for considering two records * possibly equivalent. Does not have to be set. */ public void setMaybeThreshold(double thresholdMaybe) { this.thresholdMaybe = thresholdMaybe; } /** * The set of properties Duke is to work with. */ public void setProperties(List<Property> props) { this.proplist = props; this.properties = new HashMap(props.size()); for (Property prop : props) properties.put(prop.getName(), prop); // analyze properties to find lookup set findLookupProperties(); } /** * The set of properties Duke records can have, and their associated * cleaners, comparators, and probabilities. */ public List<Property> getProperties() { return proplist; } /** * The properties which are used to identify records, rather than * compare them. */ public Collection<Property> getIdentityProperties() { Collection<Property> ids = new ArrayList(); for (Property p : getProperties()) if (p.isIdProperty()) ids.add(p); return ids; } /** * Returns the property with the given name, or null if there is no * such property. */ public Property getPropertyByName(String name) { return properties.get(name); } /** * Returns the properties Duke queries for in the Lucene index. This * is a subset of getProperties(), and is computed based on the * probabilities and the threshold. */ public Collection<Property> getLookupProperties() { return lookups; } /** * Validates the configuration to verify that it makes sense. * Rejects configurations that will fail during runtime. */ public void validate() { // verify that we do have properties if (properties == null || properties.isEmpty()) throw new DukeConfigException("Configuration has no properties at all"); // check if max prob is below threshold // this code duplicates code in findLookupProperties(), but prefer // that to creating an attribute double prob = 0.5; for (Property prop : properties.values()) { if (prop.getHighProbability() == 0.0) // if the probability is zero we ignore the property entirely continue; prob = Utils.computeBayes(prob, prop.getHighProbability()); } if (prob < threshold) throw new DukeConfigException("Maximum possible probability is " + prob + ", which is below threshold (" + threshold + "), which means no duplicates will ever " + "be found"); // check that we have at least one ID property if (getIdentityProperties().isEmpty()) throw new DukeConfigException("No ID properties."); // check that we only have one ID property if (getIdentityProperties().size() > 1) throw new DukeConfigException("Can only have one ID property."); } private void findLookupProperties() { List<Property> candidates = new ArrayList(); for (Property prop : properties.values()) // leave out properties that are either not used for comparisons, // or which have lookup turned off explicitly if (!prop.isIdProperty() && !prop.isIgnoreProperty() && prop.getLookupBehaviour() != Property.Lookup.FALSE && prop.getHighProbability() != 0.0) candidates.add(prop); // sort them, lowest high prob to highest high prob Collections.sort(candidates, new HighComparator()); // run over and find all those needed to get above the threshold int last = -1; double prob = 0.5; for (int ix = 0; ix < candidates.size(); ix++) { Property prop = candidates.get(ix); prob = Utils.computeBayes(prob, prop.getHighProbability()); if (prob >= threshold) { last = ix; break; } } if (last == -1) lookups = new ArrayList(); else lookups = new ArrayList(candidates.subList(0, last + 1)); // need to also add TRUE and REQUIRED for (Property p : proplist) { if (p.getLookupBehaviour() != Property.Lookup.TRUE && p.getLookupBehaviour() != Property.Lookup.REQUIRED) continue; if (lookups.contains(p)) continue; lookups.add(p); } } private static class HighComparator implements java.util.Comparator<Property> { public int compare(Property p1, Property p2) { if (p1.getHighProbability() < p2.getHighProbability()) return 1; else if (p1.getHighProbability() == p2.getHighProbability()) return 0; else return -1; } } public Configuration copy() { ConfigurationImpl copy = new ConfigurationImpl(); for (DataSource src : datasources) copy.addDataSource(0, src); for (DataSource src : group1) copy.addDataSource(1, src); for (DataSource src : group2) copy.addDataSource(2, src); copy.setThreshold(threshold); copy.setMaybeThreshold(thresholdMaybe); copy.addDatabase(database1); if (database2 != null) copy.addDatabase(database2); List<Property> newprops = new ArrayList(); for (Property p : proplist) newprops.add(p.copy()); copy.setProperties(newprops); return copy; } @Override public List<Comparator> getCustomComparators() { return this.customComparators; } @Override public void addCustomComparator(Comparator comparator) { this.customComparators.add(comparator); } }