package no.priv.garshol.duke; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.io.StringReader; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import no.priv.garshol.duke.cleaners.ChainedCleaner; import no.priv.garshol.duke.datasources.Column; import no.priv.garshol.duke.datasources.ColumnarDataSource; import no.priv.garshol.duke.utils.ObjectUtils; import no.priv.garshol.duke.utils.StringUtils; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; /** * Can read XML configuration files and return a fully set up configuration. */ public class ConfigLoader { /** * Note that if file starts with 'classpath:' the resource is looked * up on the classpath instead. */ public static Configuration load(String file) throws IOException, SAXException { ConfigurationImpl cfg = new ConfigurationImpl(); XMLReader parser = XMLReaderFactory.createXMLReader(); parser.setContentHandler(new ConfigHandler(cfg, file)); if (file.startsWith("classpath:")) { String resource = file.substring("classpath:".length()); ClassLoader cloader = Thread.currentThread().getContextClassLoader(); InputStream istream = cloader.getResourceAsStream(resource); parser.parse(new InputSource(istream)); } else parser.parse(file); return cfg; } /** * Loads the configuration XML from the given string. * @since 1.3 */ public static Configuration loadFromString(String config) throws IOException, SAXException { ConfigurationImpl cfg = new ConfigurationImpl(); XMLReader parser = XMLReaderFactory.createXMLReader(); parser.setContentHandler(new ConfigHandler(cfg, null)); Reader reader = new StringReader(config); parser.parse(new InputSource(reader)); return cfg; } private static class ConfigHandler extends DefaultHandler { private ConfigurationImpl config; private List<Property> properties; private List<Comparator> customComparators; private File path; // location of config file private double low; private double high; private String name; private boolean idprop; private boolean ignore_prop; private Comparator comparator; private Property.Lookup lookup; private Set<String> keepers; private int groupno; // counts datasource groups private Map<String, Object> objects; // configured Java beans for reuse private DataSource datasource; private Object currentobj; // Java bean currently being configured by <param> private Database database; private boolean keep; private StringBuffer content; private ConfigHandler(ConfigurationImpl config, String path) { this.config = config; this.properties = new ArrayList<Property>(); if (path != null && !path.startsWith("classpath:")) this.path = new File(path).getParentFile(); this.objects = new HashMap(); this.keepers = new HashSet(); this.content = new StringBuffer(); keepers.add("threshold"); keepers.add("maybe-threshold"); keepers.add("name"); keepers.add("low"); keepers.add("high"); keepers.add("comparator"); } public void startElement(String uri, String localName, String qName, Attributes attributes) { if (keepers.contains(localName)) { keep = true; content.setLength(0); // clear } else if (localName.equals("property")) { String type = attributes.getValue("type"); idprop = type != null && type.equals("id"); ignore_prop = type != null && type.equals("ignore"); low = 0.5; high = 0.5; comparator = null; lookup = Property.Lookup.DEFAULT; if (attributes.getValue("lookup") != null) lookup = (Property.Lookup) ObjectUtils.getEnumConstantByName( Property.Lookup.class, attributes.getValue("lookup").toUpperCase()); } else if (localName.equals("csv")) { datasource = (DataSource) instantiate("no.priv.garshol.duke.datasources.CSVDataSource"); currentobj = datasource; } else if (localName.equals("jdbc")) { datasource = (DataSource) instantiate("no.priv.garshol.duke.datasources.JDBCDataSource"); currentobj = datasource; } else if (localName.equals("jndi")) { datasource = (DataSource) instantiate("no.priv.garshol.duke.datasources.JNDIDataSource"); currentobj = datasource; } else if (localName.equals("sparql")) { datasource = (DataSource) instantiate("no.priv.garshol.duke.datasources.SparqlDataSource"); currentobj = datasource; } else if (localName.equals("ntriples")) { datasource = (DataSource) instantiate("no.priv.garshol.duke.datasources.NTriplesDataSource"); currentobj = datasource; } else if (localName.equals("data-source")) { datasource = (DataSource) instantiate(attributes.getValue("class")); currentobj = datasource; } else if (localName.equals("column")) { if (!(datasource instanceof ColumnarDataSource)) throw new DukeConfigException("Column inside data source which " + "does not support it: " + datasource); String name = attributes.getValue("name"); if (name == null) throw new DukeConfigException("Column with no name"); String property = attributes.getValue("property"); String prefix = attributes.getValue("prefix"); String cleanername = attributes.getValue("cleaner"); Cleaner cleaner = makeCleaner(cleanername); Column c = new Column(name, property, prefix, cleaner); String spliton = attributes.getValue("split-on"); if (spliton != null) c.setSplitOn(spliton); ((ColumnarDataSource) datasource).addColumn(c); } else if (localName.equals("param")) { String param = attributes.getValue("name"); String value = attributes.getValue("value"); if (currentobj == null) throw new DukeConfigException("Trying to set parameter " + param + " but no current object"); // we resolve file references relative to the config file location if (param.equals("input-file") && path != null && !value.startsWith("/")) value = new File(path, value).getAbsolutePath(); ObjectUtils.setBeanProperty(currentobj, param, value, objects); } else if (localName.equals("group")) { groupno++; // FIXME: now possible to have data sources between the two // groups. need to check for that, too. ideally XML // validation should take care of all this for us. if (groupno == 1 && !config.getDataSources().isEmpty()) throw new DukeConfigException("Cannot have groups in deduplication mode"); else if (groupno == 3) throw new DukeConfigException("Record linkage mode only supports " + "two groups"); } else if (localName.equals("object")) { String klass = attributes.getValue("class"); String name = attributes.getValue("name"); currentobj = instantiate(klass); objects.put(name, currentobj); } else if (localName.equals("database")) { String klass = attributes.getValue("class"); if (klass == null) klass = "no.priv.garshol.duke.databases.InMemoryDatabase"; // default database = (Database) instantiate(klass); currentobj = database; } } public void characters(char[] ch, int start, int length) { if (keep) content.append(ch, start, length); } public void endElement(String uri, String localName, String qName) { if (localName.equals("threshold")) config.setThreshold(Double.parseDouble(content.toString())); else if (localName.equals("maybe-threshold")) config.setMaybeThreshold(Double.parseDouble(content.toString())); else if (localName.equals("name")) name = content.toString(); else if (localName.equals("property")) { if (idprop) properties.add(new PropertyImpl(name)); else { Property p = new PropertyImpl(name, comparator, low, high); if (ignore_prop) p.setIgnoreProperty(true); p.setLookupBehaviour(lookup); properties.add(p); } } else if (localName.equals("low")) low = Double.parseDouble(content.toString()); else if (localName.equals("high")) high = Double.parseDouble(content.toString()); else if (localName.equals("comparator")) { comparator = (Comparator) objects.get(content.toString()); if (comparator == null) // wasn't a configured bean comparator = (Comparator) instantiate(content.toString()); } else if (localName.equals("csv") || localName.equals("jdbc") || localName.equals("jndi") || localName.equals("ntriples") || localName.equals("sparql") || localName.equals("data-source")) { config.addDataSource(groupno, datasource); datasource = null; currentobj = null; } else if (localName.equals("object")) { if (currentobj instanceof Comparator) // store custom comparators so genetic algorithm can get them config.addCustomComparator((Comparator) currentobj); currentobj = null; } else if (localName.equals("database")) config.addDatabase(database); if (keepers.contains(localName)) keep = false; else if (localName.equals("duke")) { if (groupno > 0 && groupno != 2) throw new DukeConfigException("Record linkage mode requires exactly 2 groups; should you be using deduplication mode?"); } } public void endDocument() { config.setProperties(properties); } private Cleaner makeCleaner(String value) { if (value == null) return null; String[] names = StringUtils.split(value); Cleaner[] cleaners = new Cleaner[names.length]; for (int ix = 0; ix < cleaners.length; ix++) cleaners[ix] = _makeCleaner(names[ix]); if (cleaners.length == 1) return cleaners[0]; else return new ChainedCleaner(cleaners); } private Cleaner _makeCleaner(String name) { Cleaner cleaner = (Cleaner) objects.get(name); if (cleaner == null) // wasn't a configured bean cleaner = (Cleaner) instantiate(name); return cleaner; } } private static Object instantiate(String classname) { try { Class klass = Class.forName(classname); return klass.newInstance(); } catch (Exception e) { throw new DukeConfigException("Couldn't instantiate class " + classname + ": " + e); } } }