package no.priv.garshol.duke;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;
import java.io.Writer;
import java.io.PrintWriter;

import no.priv.garshol.duke.matchers.MatchListener;
import no.priv.garshol.duke.matchers.PrintMatchListener;
import no.priv.garshol.duke.matchers.AbstractMatchListener;
import no.priv.garshol.duke.utils.Utils;
import no.priv.garshol.duke.utils.DefaultRecordIterator;

/**
 * The class that implements the actual deduplication and record
 * linkage logic.
 */
public class Processor {
  private Configuration config;
  private Collection<MatchListener> listeners;
  private Logger logger;
  private List<Property> proporder;
  private double[] accprob;
  private int threads;

  private Database database1;
  private Database database2;

  private final static int DEFAULT_BATCH_SIZE = 40000;

  // performance statistics
  private long comparisons; // number of records compared
  private long srcread;     // ms spent reading from data sources
  private long indexing;    // ms spent indexing records
  private long searching;   // ms spent searching for records
  private long comparing;   // ms spent comparing records
  private long callbacks;   // ms spent in callbacks
  private Profiler profiler;

  /**
   * Creates a new processor, overwriting the existing Lucene index.
   */
  public Processor(Configuration config) {
    this(config, true);
  }

  /**
   * Creates a new processor.
   * @param overwrite If true, make new Lucene index. If false, leave
   * existing data.
   */
  public Processor(Configuration config, boolean overwrite) {
    this(config, config.getDatabase(1, overwrite));
    database2 = config.getDatabase(2, overwrite);
  }

  /**
   * Creates a new processor, bound to the given database.
   */
  public Processor(Configuration config, Database database) {
    this.config = config;
    this.database1 = database;
    // using this List implementation so that listeners can be removed
    // while Duke is running (see issue 117)
    this.listeners = new CopyOnWriteArrayList<MatchListener>();
    this.logger = new DummyLogger();
    this.threads = 1;

    // precomputing for later optimizations
    this.proporder = new ArrayList();
    for (Property p : config.getProperties())
      if (!p.isIdProperty())
        proporder.add(p);
    Collections.sort(proporder, new PropertyComparator());

    // still precomputing
    double prob = 0.5;
    accprob = new double[proporder.size()];
    for (int ix = proporder.size() - 1; ix >= 0; ix--) {
      prob = Utils.computeBayes(prob, proporder.get(ix).getHighProbability());
      accprob[ix] = prob;
    }
  }

  /**
   * Sets the logger to report to.
   */
  public void setLogger(Logger logger) {
    this.logger = logger;
  }

  /**
   * Sets the number of threads to use for processing. The default is
   * 1.
   */
  public void setThreads(int threads) {
    this.threads = threads;
  }

  /**
   * Returns the number of threads.
   */
  public int getThreads() {
    return threads;
  }

  /**
   * Adds a listener to be notified of processing events.
   */
  public void addMatchListener(MatchListener listener) {
    listeners.add(listener);
  }

  /**
   * Removes a listener from being notified of the processing events.
   * @since 1.1
   */
  public boolean removeMatchListener(MatchListener listener) {
    if (listener != null)
      return listeners.remove(listener);
    return true;
  }

  /**
   * Returns all registered listeners.
   */
  public Collection<MatchListener> getListeners() {
    return listeners;
  }

  /**
   * Returns the Database in which the Duke index is stored. This will
   * be the Lucene index if you are using the Lucene database.
   */
  public Database getDatabase() {
    return database1;
  }

  /**
   * Returns the Database in which the Duke index is stored for the
   * given group in record linkage mode. This will be the Lucene index
   * if you are using the Lucene database.
   * @param group Must be 1 or 2.
   */
  public Database getDatabase(int group) {
    if (group == 1)
      return database1;
    else if (group == 2)
      return database2;
    throw new DukeException("Unknown group " + group);
  }

  /**
   * Used to turn performance profiling on and off.
   * @since 1.1
   */
  public void setPerformanceProfiling(boolean profile) {
    if (profile) {
      if (profiler != null)
        return; // we're already profiling

      this.profiler = new Profiler();
      addMatchListener(profiler);
    } else {
      // turn off profiling
      if (profiler == null)
        return; // we're not profiling, so nothing to do

      removeMatchListener(profiler);
      profiler = null;
    }
  }

  /**
   * Returns the performance profiler, if any.
   * @since 1.1
   */
  public Profiler getProfiler() {
    return profiler;
  }
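  /*
   * Illustrative sketch: listeners registered with addMatchListener()
   * receive the events fired by the methods below (matches,
   * matchesPerhaps, noMatchFor, batchReady, batchDone). A minimal
   * listener, assuming AbstractMatchListener supplies empty default
   * implementations of the remaining callbacks, could look like this:
   *
   *   class LoggingListener extends AbstractMatchListener {
   *     public void matches(Record r1, Record r2, double confidence) {
   *       System.out.println("MATCH " + confidence + ": " + r1 + " " + r2);
   *     }
   *     public void noMatchFor(Record record) {
   *       System.out.println("NO MATCH for " + record);
   *     }
   *   }
   */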
  /**
   * Reads all available records from the data sources and processes
   * them in batches, notifying the listeners throughout.
   */
  public void deduplicate() {
    deduplicate(config.getDataSources(), DEFAULT_BATCH_SIZE);
  }

  /**
   * Reads all available records from the data sources and processes
   * them in batches, notifying the listeners throughout.
   */
  public void deduplicate(int batch_size) {
    deduplicate(config.getDataSources(), batch_size);
  }

  /**
   * Reads all available records from the data sources and processes
   * them in batches, notifying the listeners throughout.
   */
  public void deduplicate(Collection<DataSource> sources, int batch_size) {
    int count = 0;
    startProcessing();

    Iterator<DataSource> it = sources.iterator();
    while (it.hasNext()) {
      DataSource source = it.next();
      source.setLogger(logger);

      RecordIterator it2 = source.getRecords();
      try {
        Collection<Record> batch = new ArrayList();
        long start = System.currentTimeMillis();
        while (it2.hasNext()) {
          Record record = it2.next();
          batch.add(record);
          count++;
          if (count % batch_size == 0) {
            srcread += (System.currentTimeMillis() - start);
            deduplicate(batch);
            it2.batchProcessed();
            batch = new ArrayList();
            start = System.currentTimeMillis();
          }
        }

        if (!batch.isEmpty()) {
          deduplicate(batch);
          it2.batchProcessed();
        }
      } finally {
        it2.close();
      }
    }

    endProcessing();
  }
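  /*
   * Typical deduplication usage, sketched only; it assumes a Configuration
   * has been built or loaded elsewhere (for example with ConfigLoader; the
   * exact loading call is an assumption and not shown here):
   *
   *   Configuration config = ...;               // dedup configuration
   *   Processor proc = new Processor(config);   // overwrites existing index
   *   proc.addMatchListener(new LoggingListener());
   *   proc.deduplicate();                       // all sources, default batch size
   *   proc.close();
   */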
  /**
   * Deduplicates a newly arrived batch of records. The records may
   * have been seen before.
   */
  public void deduplicate(Collection<Record> records) {
    logger.info("Deduplicating batch of " + records.size() + " records");
    batchReady(records.size());

    // prepare
    long start = System.currentTimeMillis();
    for (Record record : records)
      database1.index(record);

    database1.commit();
    indexing += System.currentTimeMillis() - start;

    // then match
    match(records, true);
    batchDone();
  }

  private void match(Collection<Record> records, boolean matchall) {
    if (threads == 1)
      for (Record record : records)
        match(1, record, matchall);
    else
      threadedmatch(records, matchall);
  }

  private void threadedmatch(Collection<Record> records, boolean matchall) {
    // split batch into n smaller batches
    MatchThread[] threads = new MatchThread[this.threads];
    for (int ix = 0; ix < threads.length; ix++)
      threads[ix] = new MatchThread(ix, records.size() / threads.length,
                                    matchall);
    int ix = 0;
    for (Record record : records)
      threads[ix++ % threads.length].addRecord(record);

    // kick off threads
    for (ix = 0; ix < threads.length; ix++)
      threads[ix].start();

    // wait for threads to finish
    try {
      for (ix = 0; ix < threads.length; ix++)
        threads[ix].join();
    } catch (InterruptedException e) {
      // argh
    }
  }

  /**
   * Does record linkage across the two groups, but does not link
   * records within each group.
   */
  public void link() {
    link(config.getDataSources(1), config.getDataSources(2),
         DEFAULT_BATCH_SIZE);
  }

  // FIXME: what about the general case, where there are more than 2 groups?
  /**
   * Does record linkage across the two groups, but does not link
   * records within each group. With this method, <em>all</em> matches
   * above threshold are passed on.
   */
  public void link(Collection<DataSource> sources1,
                   Collection<DataSource> sources2,
                   int batch_size) {
    link(sources1, sources2, true, batch_size);
  }

  /**
   * Does record linkage across the two groups, but does not link
   * records within each group.
   * @param matchall If true, all matching records are accepted. If false,
   * only the single best match for each record is accepted.
   * @param batch_size The batch size to use.
   * @since 1.1
   */
  public void link(Collection<DataSource> sources1,
                   Collection<DataSource> sources2,
                   boolean matchall,
                   int batch_size) {
    startProcessing();

    // start with source 1
    for (Collection<Record> batch : makeBatches(sources1, batch_size)) {
      index(1, batch);
      if (hasTwoDatabases())
        linkBatch(2, batch, matchall);
    }

    // then source 2
    for (Collection<Record> batch : makeBatches(sources2, batch_size)) {
      if (hasTwoDatabases())
        index(2, batch);
      linkBatch(1, batch, matchall);
    }

    endProcessing();
  }
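  /*
   * Record linkage usage, sketched under the same assumptions as the
   * deduplication example above:
   *
   *   Processor proc = new Processor(config);
   *   proc.addMatchListener(new LoggingListener());
   *   proc.link();    // matches group 1 against group 2, never within a group
   *   proc.close();
   *
   * To match freshly arrived records against an already built index,
   * index() can be combined with the linkRecords() variants below, which
   * read records without indexing them.
   */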
  /**
   * Retrieve new records from data sources, and match them to
   * previously indexed records. This method does <em>not</em> index
   * the new records. With this method, <em>all</em> matches above
   * threshold are passed on.
   * @since 0.4
   */
  public void linkRecords(Collection<DataSource> sources) {
    linkRecords(sources, true);
  }

  /**
   * Retrieve new records from data sources, and match them to
   * previously indexed records. This method does <em>not</em> index
   * the new records.
   * @param matchall If true, all matching records are accepted. If false,
   * only the single best match for each record is accepted.
   * @since 0.5
   */
  public void linkRecords(Collection<DataSource> sources, boolean matchall) {
    linkRecords(sources, matchall, DEFAULT_BATCH_SIZE);
  }

  /**
   * Retrieve new records from data sources, and match them to
   * previously indexed records. This method does <em>not</em> index
   * the new records.
   * @param matchall If true, all matching records are accepted. If false,
   * only the single best match for each record is accepted.
   * @param batch_size The batch size to use.
   * @since 1.0
   */
  public void linkRecords(Collection<DataSource> sources, boolean matchall,
                          int batch_size) {
    linkRecords(1, sources, matchall, batch_size);
  }

  /**
   * Retrieve new records from data sources, and match them to
   * previously indexed records in the given database. This method
   * does <em>not</em> index the new records.
   * @param dbno Which database to match against.
   * @param matchall If true, all matching records are accepted. If false,
   * only the single best match for each record is accepted.
   * @param batch_size The batch size to use.
   * @since 1.3
   */
  public void linkRecords(int dbno, Collection<DataSource> sources,
                          boolean matchall, int batch_size) {
    for (DataSource source : sources) {
      source.setLogger(logger);

      Collection<Record> batch = new ArrayList(batch_size);
      RecordIterator it = source.getRecords();
      while (it.hasNext()) {
        batch.add(it.next());
        if (batch.size() == batch_size) {
          linkBatch(dbno, batch, matchall);
          batch.clear();
        }
      }
      it.close();

      if (!batch.isEmpty())
        linkBatch(dbno, batch, matchall);
    }

    endProcessing();
  }

  private void linkBatch(int dbno, Collection<Record> batch, boolean matchall) {
    batchReady(batch.size());
    for (Record r : batch)
      match(dbno, r, matchall);
    batchDone();
  }

  /**
   * Index all new records from the given data sources. This method
   * does <em>not</em> do any matching.
   * @since 0.4
   */
  public void index(Collection<DataSource> sources, int batch_size) {
    index(1, sources, batch_size);
  }

  /**
   * Index all new records from the given data sources into the given
   * database. This method does <em>not</em> do any matching.
   * @since 1.3
   */
  public void index(int dbno, Collection<DataSource> sources, int batch_size) {
    Database thedb = getDB(dbno);
    int count = 0;
    for (DataSource source : sources) {
      source.setLogger(logger);

      RecordIterator it2 = source.getRecords();
      while (it2.hasNext()) {
        Record record = it2.next();
        if (logger.isDebugEnabled())
          logger.debug("Indexing record " + record);
        thedb.index(record);
        count++;
        if (count % batch_size == 0)
          batchReady(batch_size);
      }
      it2.close();
    }
    // report the final, partial batch, if there is one
    if (count % batch_size > 0)
      batchReady(count % batch_size);
    thedb.commit();
  }

  /**
   * Index the records into the given database. This method does
   * <em>not</em> do any matching.
   * @since 1.3
   */
  public void index(int dbno, Collection<Record> batch) {
    Database thedb = getDB(dbno);
    for (Record r : batch) {
      if (logger.isDebugEnabled())
        logger.debug("Indexing record " + r);
      thedb.index(r);
    }
    thedb.commit();
  }

  /**
   * Returns the number of records that have been compared.
   */
  public long getComparisonCount() {
    return comparisons;
  }

  private void match(int dbno, Record record, boolean matchall) {
    long start = System.currentTimeMillis();
    Collection<Record> candidates = getDB(dbno).findCandidateMatches(record);
    searching += System.currentTimeMillis() - start;
    if (logger.isDebugEnabled())
      logger.debug("Matching record " +
                   PrintMatchListener.toString(record, config.getProperties()) +
                   " found " + candidates.size() + " candidates");

    start = System.currentTimeMillis();
    if (matchall)
      compareCandidatesSimple(record, candidates);
    else
      compareCandidatesBest(record, candidates);
    comparing += System.currentTimeMillis() - start;
  }

  // ===== RECORD LINKAGE STRATEGIES
  // the following two methods implement different record matching
  // strategies. the first is used for deduplication, where we simply
  // want all matches above the thresholds. the second is used for
  // record linkage, to implement a simple greedy matching algorithm
  // where we choose the best alternative above the threshold for each
  // record.

  // other, more advanced possibilities exist for record linkage, but
  // they are not implemented yet. see the links below for more
  // information.

  // http://code.google.com/p/duke/issues/detail?id=55
  // http://research.microsoft.com/pubs/153478/msr-report-1to1.pdf
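  /*
   * A worked example of the thresholds, using hypothetical values: say
   * config.getThreshold() is 0.85 and config.getMaybeThreshold() is 0.7.
   * In compareCandidatesSimple() a candidate scoring 0.92 is reported
   * through matches(), one scoring 0.78 through matchesPerhaps(), and
   * noMatchFor() fires only if no candidate reached either threshold.
   * compareCandidatesBest() applies the same two thresholds, but only to
   * the single highest-scoring candidate.
   */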
  /**
   * Passes on all matches found.
   */
  protected void compareCandidatesSimple(Record record,
                                         Collection<Record> candidates) {
    boolean found = false;
    for (Record candidate : candidates) {
      if (isSameAs(record, candidate))
        continue;

      double prob = compare(record, candidate);
      if (prob > config.getThreshold()) {
        found = true;
        registerMatch(record, candidate, prob);
      } else if (config.getMaybeThreshold() != 0.0 &&
                 prob > config.getMaybeThreshold()) {
        found = true; // I guess?
        registerMatchPerhaps(record, candidate, prob);
      }
    }
    if (!found)
      registerNoMatchFor(record);
  }

  /**
   * Passes on only the best match for each record.
   */
  protected void compareCandidatesBest(Record record,
                                       Collection<Record> candidates) {
    double max = 0.0;
    Record best = null;

    // go through all candidates, and find the best
    for (Record candidate : candidates) {
      if (isSameAs(record, candidate))
        continue;

      double prob = compare(record, candidate);
      if (prob > max) {
        max = prob;
        best = candidate;
      }
    }

    // pass on the best match, if any
    if (logger.isDebugEnabled()) {
      logger.debug("Best candidate at " + max + " is " + best);
    }
    if (max > config.getThreshold())
      registerMatch(record, best, max);
    else if (config.getMaybeThreshold() != 0.0 &&
             max > config.getMaybeThreshold())
      registerMatchPerhaps(record, best, max);
    else
      registerNoMatchFor(record);
  }

  /**
   * Compares two records and returns the probability that they
   * represent the same real-world entity.
   */
  public double compare(Record r1, Record r2) {
    comparisons++;
    double prob = 0.5;
    for (String propname : r1.getProperties()) {
      Property prop = config.getPropertyByName(propname);
      if (prop == null)
        continue; // means the property is unknown
      if (prop.isIdProperty() || prop.isIgnoreProperty())
        continue;

      Collection<String> vs1 = r1.getValues(propname);
      Collection<String> vs2 = r2.getValues(propname);
      if (vs1 == null || vs1.isEmpty() || vs2 == null || vs2.isEmpty())
        continue; // no values to compare, so skip

      double high = 0.0;
      for (String v1 : vs1) {
        if (v1.equals("")) // FIXME: these values shouldn't be here at all
          continue;

        for (String v2 : vs2) {
          if (v2.equals("")) // FIXME: these values shouldn't be here at all
            continue;

          try {
            double p = prop.compare(v1, v2);
            high = Math.max(high, p);
          } catch (Exception e) {
            throw new DukeException("Comparison of values '" + v1 + "' and " +
                                    "'" + v2 + "' with " +
                                    prop.getComparator() + " failed", e);
          }
        }
      }

      prob = Utils.computeBayes(prob, high);
    }
    return prob;
  }
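  /*
   * A worked example of how compare() accumulates evidence, assuming
   * Utils.computeBayes(a, b) implements the pairwise Bayes update
   * a*b / (a*b + (1-a)*(1-b)) described in the Duke documentation.
   * With hypothetical properties and probabilities:
   *
   *   start                          prob = 0.5
   *   NAME  agrees,  high = 0.9  ->  prob = 0.45 / (0.45 + 0.05)     = 0.9
   *   EMAIL agrees,  high = 0.8  ->  prob = 0.72 / (0.72 + 0.02)     ~ 0.973
   *   PHONE differs, high = 0.3  ->  prob = 0.292 / (0.292 + 0.019)  ~ 0.94
   *
   * Only the best-scoring value pair per property (the 'high' variable)
   * contributes, and properties without values are skipped entirely.
   */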
  /**
   * Commits all state to disk and frees up resources.
   */
  public void close() {
    database1.close();
    if (hasTwoDatabases())
      database2.close();
  }

  // ===== INTERNALS

  private Iterable<Collection<Record>> makeBatches(Collection<DataSource> sources,
                                                   int batch_size) {
    return new BatchIterator(sources, batch_size);
  }

  static class BatchIterator
    implements Iterable<Collection<Record>>, Iterator<Collection<Record>> {
    private BasicIterator it;
    private int batch_size;

    public BatchIterator(Collection<DataSource> sources, int batch_size) {
      this.it = new BasicIterator(sources);
      this.batch_size = batch_size;
    }

    public boolean hasNext() {
      return it.hasNext();
    }

    public Collection<Record> next() {
      Collection<Record> batch = new ArrayList();
      // cut the batch off at batch_size; the remaining records come in
      // later batches
      while (it.hasNext() && batch.size() < batch_size)
        batch.add(it.next());
      return batch;
    }

    public Iterator<Collection<Record>> iterator() {
      return this;
    }

    public void remove() {
      throw new UnsupportedOperationException();
    }
  }

  static class BasicIterator implements Iterator<Record> {
    private Iterator<DataSource> srcit;
    private RecordIterator recit;

    public BasicIterator(Collection<DataSource> sources) {
      this.srcit = sources.iterator();
      findNextIterator();
    }

    public boolean hasNext() {
      return recit.hasNext();
    }

    public Record next() {
      Record r = recit.next();
      if (!recit.hasNext())
        findNextIterator();
      return r;
    }

    private void findNextIterator() {
      if (srcit.hasNext()) {
        DataSource src = srcit.next();
        recit = src.getRecords();
      } else
        recit = new DefaultRecordIterator(Collections.EMPTY_SET.iterator());
    }

    public void remove() {
      throw new UnsupportedOperationException();
    }
  }

  public boolean hasTwoDatabases() {
    return database2 != null;
  }

  private Database getDB(int no) {
    if (no == 1)
      return database1;
    else if (no == 2)
      return database2;
    else
      throw new DukeException("Unknown database " + no);
  }

  private boolean isSameAs(Record r1, Record r2) {
    for (Property idp : config.getIdentityProperties()) {
      Collection<String> vs2 = r2.getValues(idp.getName());
      Collection<String> vs1 = r1.getValues(idp.getName());
      if (vs1 == null)
        continue;
      for (String v1 : vs1)
        if (vs2.contains(v1))
          return true;
    }
    return false;
  }

  private void startProcessing() {
    if (logger.isDebugEnabled())
      logger.debug("Start processing with " + database1 + " and " + database2);
    long start = System.currentTimeMillis();
    for (MatchListener listener : listeners)
      listener.startProcessing();
    callbacks += (System.currentTimeMillis() - start);
  }

  private void endProcessing() {
    long start = System.currentTimeMillis();
    for (MatchListener listener : listeners)
      listener.endProcessing();
    callbacks += (System.currentTimeMillis() - start);
  }

  private void batchReady(int size) {
    long start = System.currentTimeMillis();
    for (MatchListener listener : listeners)
      listener.batchReady(size);
    callbacks += (System.currentTimeMillis() - start);
  }

  private void batchDone() {
    long start = System.currentTimeMillis();
    for (MatchListener listener : listeners)
      listener.batchDone();
    callbacks += (System.currentTimeMillis() - start);
  }

  /**
   * Records the statement that the two records match.
   */
  private void registerMatch(Record r1, Record r2, double confidence) {
    long start = System.currentTimeMillis();
    for (MatchListener listener : listeners)
      listener.matches(r1, r2, confidence);
    callbacks += (System.currentTimeMillis() - start);
  }

  /**
   * Records the statement that the two records may match.
   */
  private void registerMatchPerhaps(Record r1, Record r2, double confidence) {
    long start = System.currentTimeMillis();
    for (MatchListener listener : listeners)
      listener.matchesPerhaps(r1, r2, confidence);
    callbacks += (System.currentTimeMillis() - start);
  }

  /**
   * Notifies listeners that we found no matches for this record.
   */
  private void registerNoMatchFor(Record current) {
    long start = System.currentTimeMillis();
    for (MatchListener listener : listeners)
      listener.noMatchFor(current);
    callbacks += (System.currentTimeMillis() - start);
  }

  /**
   * Sorts properties so that the properties with the lowest low
   * probabilities come first.
   */
  static class PropertyComparator implements Comparator<Property> {
    public int compare(Property p1, Property p2) {
      double diff = p1.getLowProbability() - p2.getLowProbability();
      if (diff < 0)
        return -1;
      else if (diff > 0)
        return 1;
      else
        return 0;
    }
  }

  // ===== THREADS

  /**
   * The thread that actually runs parallel matching. It holds the
   * thread's share of the current batch.
   */
  class MatchThread extends Thread {
    private Collection<Record> records;
    private boolean matchall;

    public MatchThread(int threadno, int recordcount, boolean matchall) {
      super("MatchThread " + threadno);
      this.records = new ArrayList(recordcount);
      this.matchall = matchall;
    }

    public void run() {
      for (Record record : records)
        match(1, record, matchall);
    }

    public void addRecord(Record record) {
      records.add(record);
    }
  }
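  /*
   * Sketch of enabling parallel matching, assuming the setup from the
   * earlier usage examples: each batch is split round-robin across the
   * MatchThread instances above.
   *
   *   proc.setThreads(4);
   *   proc.deduplicate();
   *
   * Indexing still happens on the calling thread before matching starts;
   * it is the candidate search and the comparisons that run in parallel.
   */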
  // ===== PERFORMANCE PROFILING

  public class Profiler extends AbstractMatchListener {
    private long processing_start;
    private long batch_start;
    private int batch_size;
    private int records;
    private PrintWriter out;

    public Profiler() {
      this.out = new PrintWriter(System.out);
    }

    /**
     * Sets Writer to receive performance statistics. Defaults to
     * System.out.
     */
    public void setOutput(Writer outw) {
      this.out = new PrintWriter(outw);
    }

    public void startProcessing() {
      processing_start = System.currentTimeMillis();
      System.out.println("Duke version " + Duke.getVersionString());
      System.out.println(getDatabase());
      if (hasTwoDatabases())
        System.out.println(database2);
      System.out.println("Threads: " + getThreads());
    }

    public void batchReady(int size) {
      batch_start = System.currentTimeMillis();
      batch_size = size;
    }

    public void batchDone() {
      records += batch_size;
      int rs = (int) ((1000.0 * batch_size) /
                      (System.currentTimeMillis() - batch_start));
      System.out.println("" + records + " processed, " + rs +
                         " records/second; comparisons: " +
                         getComparisonCount());
    }

    public void endProcessing() {
      long end = System.currentTimeMillis();
      double rs = (1000.0 * records) / (end - processing_start);
      System.out.println("Run completed, " + (int) rs + " records/second");
      System.out.println("" + records + " records total in " +
                         ((end - processing_start) / 1000) + " seconds");

      long total = srcread + indexing + searching + comparing + callbacks;
      System.out.println("Reading from source: " + seconds(srcread) + " (" +
                         percent(srcread, total) + "%)");
      System.out.println("Indexing: " + seconds(indexing) + " (" +
                         percent(indexing, total) + "%)");
      System.out.println("Searching: " + seconds(searching) + " (" +
                         percent(searching, total) + "%)");
      System.out.println("Comparing: " + seconds(comparing) + " (" +
                         percent(comparing, total) + "%)");
      System.out.println("Callbacks: " + seconds(callbacks) + " (" +
                         percent(callbacks, total) + "%)");
      System.out.println();

      Runtime r = Runtime.getRuntime();
      System.out.println("Total memory: " + r.totalMemory() + ", " +
                         "free memory: " + r.freeMemory() + ", " +
                         "used memory: " + (r.totalMemory() - r.freeMemory()));
    }

    private String seconds(long ms) {
      return "" + (int) (ms / 1000);
    }

    private String percent(long ms, long total) {
      return "" + (int) ((double) (ms * 100) / (double) total);
    }
  }
}