package no.priv.garshol.duke.comparators;

import java.util.List;
import java.util.ArrayList;
import java.util.Collections;

import no.priv.garshol.duke.Comparator;
import no.priv.garshol.duke.utils.StringUtils;

/**
 * A tokenized approach to string similarity, based on Jaccard
 * equivalence and the Jaro-Winkler metric.
 *
 * <p>Both strings are split into tokens, every token of one string is
 * scored against every token of the other with
 * {@code JaroWinkler.similarity}, and the best-scoring pairs are then
 * picked greedily so that no token takes part in more than one pair.
 * The result is the sum of the picked scores divided by the number of
 * tokens in the string with the fewest tokens.
 *
 * <p>FIXME: Do we actually need this, or is DiceCoefficientComparator
 * better? I guess Dice probably is better. However, the code for not
 * allowing the same token to be matched twice is unique to this
 * comparator. Should we reuse it in Dice, or just support more methods
 * than just Dice?
 */
public class JaroWinklerTokenized implements Comparator {

  /**
   * Reports that this comparator works on tokens.
   *
   * @return always {@code true}
   */
  public boolean isTokenized() {
    return true;
  }

  /**
   * Computes the tokenized similarity of two strings.
   *
   * @param s1 the first string; must not be {@code null}
   * @param s2 the second string
   * @return 1.0 if the strings are equal; 0.0 if they differ and at
   *         least one of them produces no tokens; otherwise the sum of
   *         the greedily selected pair scores divided by the token
   *         count of the string with the fewest tokens
   */
  public double compare(String s1, String s2) {
    if (s1.equals(s2))
      return 1.0;

    // tokenize
    String[] t1 = StringUtils.split(s1);
    String[] t2 = StringUtils.split(s2);

    // ensure that t1 is shorter than or same length as t2
    if (t1.length > t2.length) {
      String[] tmp = t2;
      t2 = t1;
      t1 = tmp;
    }

    // with no tokens on the shorter side there is nothing to pair up,
    // and the division below would produce NaN (0.0 / 0). the strings
    // are known to differ at this point, so report no similarity.
    if (t1.length == 0)
      return 0.0;

    // compute all comparisons
    List<Match> matches = new ArrayList<Match>(t1.length * t2.length);
    for (int ix1 = 0; ix1 < t1.length; ix1++)
      for (int ix2 = 0; ix2 < t2.length; ix2++)
        matches.add(new Match(JaroWinkler.similarity(t1[ix1], t2[ix2]),
                              ix1, ix2));

    // sort, best score first
    Collections.sort(matches);

    // now pick the best matches, never allowing the same token to be
    // included twice. usage is tracked in separate flag arrays so that
    // the token arrays themselves are left untouched.
    boolean[] used1 = new boolean[t1.length];
    boolean[] used2 = new boolean[t2.length];
    int unmatched = t1.length;
    double sum = 0.0;
    for (Match m : matches) {
      if (used1[m.ix1] || used2[m.ix2])
        continue;

      sum += m.score;
      used1[m.ix1] = true;
      used2[m.ix2] = true;

      // once every token in t1 is paired no later match can be picked
      if (--unmatched == 0)
        break;
    }

    return sum / t1.length;
  }

  /**
   * The score of one (t1 token, t2 token) pair, identified by the
   * tokens' indexes. The natural ordering puts the highest score first.
   */
  static class Match implements Comparable<Match> {
    double score;
    int ix1;
    int ix2;

    public Match(double score, int ix1, int ix2) {
      this.score = score;
      this.ix1 = ix1;
      this.ix2 = ix2;
    }

    /**
     * Orders matches by descending score. Double.compare is used so
     * that the ordering is total even if a score is NaN.
     */
    public int compareTo(Match other) {
      return Double.compare(other.score, score);
    }
  }

  // THE OLD CODE

  // public double compare(String s1, String s2) {
  //   if (s1.equals(s2))
  //     return 1.0;

  //   // tokenize
  //   String[] t1 = StringUtils.split(s1);
  //   String[] t2 = StringUtils.split(s2);

  //   // ensure that t1 is shorter than or same length as t2
  //   if (t1.length > t2.length) {
  //     String[] tmp = t2;
  //     t2 = t1;
  //     t1 = tmp;
  //   }

  //   // find best matches for each token in t1
  //   double sum = 0;
  //   for (int ix1 = 0; ix1 < t1.length; ix1++) {
  //     double highest = 0;
  //     for (int ix2 = 0; ix2 < t2.length; ix2++)
  //       highest = Math.max(highest, JaroWinkler.similarity(t1[ix1], t2[ix2]));
  //     sum += highest;
  //   }

  //   return sum / t1.length;
  // }
}