package no.priv.garshol.duke.comparators;
import no.priv.garshol.duke.Comparator;
import no.priv.garshol.duke.DukeConfigException;
/**
 * <p>An implementation of the longest common substring comparator. Note
 * that it does not merely find the longest common substring, but does
 * so repeatedly down to a minimal substring length.
 *
 * <p>The similarity is computed from the total length of the common
 * substrings that were found, using the configured {@link Formula}.
 * Because the outcome of the repeated extraction can depend on the
 * order of the two strings, {@link #compare(String, String)} runs the
 * computation in both orders and returns the mean.
 *
 * <p>Described in P. Christen, chapter 5.9. Also in Tolerating spelling
 * errors during patient validation; Friedman C, Sideli R.; Comput
 * Biomed Res. 1992 Oct;25(5):486-509.
 * http://www.cs.utah.edu/contest/2005/spellingErrors.pdf
 *
 * @since 1.2
 */
public class LongestCommonSubstring implements Comparator {
  // shortest common substring that is still counted; the setter keeps
  // this at 1 or more, which the extraction loop relies on to terminate
  private int minlen = 2;
  // formula turning the removed length into a similarity; never null
  private Formula formula = Formula.OVERLAP;

  /**
   * Computes the similarity of the two strings.
   *
   * @param s1 the first string
   * @param s2 the second string
   * @return 1.0 if the strings are equal, 0.0 if either one is empty,
   *         otherwise the mean of the formula's result for both
   *         argument orders
   */
  public double compare(String s1, String s2) {
    // a couple of quick cutoffs
    if (s1.equals(s2)) {
      return 1.0;
    }
    if (Math.min(s1.length(), s2.length()) == 0) {
      return 0.0;
    }

    // the result of the algorithm depends on the order of the input
    // strings, so we compute it in both directions and average
    return (compareOneWay(s1, s2) + compareOneWay(s2, s1)) / 2.0;
  }

  /**
   * Repeatedly finds the longest common substring of the two strings,
   * cuts it out of both, and sums up the lengths of what was cut, until
   * the longest remaining common substring is shorter than the minimum
   * length. The sum is then handed to the configured formula together
   * with the original string lengths.
   */
  private double compareOneWay(String s1, String s2) {
    // before we begin, note the lengths of the untouched strings
    int shortlen = Math.min(s1.length(), s2.length());
    int longlen = Math.max(s1.length(), s2.length());
    int removed = 0; // total length of common substrings

    while (true) {
      int len1 = s1.length();
      int len2 = s2.length();

      int longest = 0; // length of the longest common substring
      int end1 = 0;    // index in s1 just past its last character
      int end2 = 0;    // index in s2 just past its last character

      // Only the previous row of the dynamic programming table is ever
      // read, so two rows are enough. Both rows have an extra cell at
      // index 0 which is never written and therefore stays 0; it stands
      // in for "no match before the start of the string".
      int[] previous = new int[len2 + 1];
      int[] current = new int[len2 + 1];
      for (int i = 1; i <= len1; i++) {
        char ch = s1.charAt(i - 1);
        for (int j = 1; j <= len2; j++) {
          if (ch == s2.charAt(j - 1)) {
            current[j] = previous[j - 1] + 1;
            // strict comparison: the first longest match found wins
            if (current[j] > longest) {
              longest = current[j];
              end1 = i;
              end2 = j;
            }
          } else {
            current[j] = 0;
          }
        }
        // the row just filled becomes the previous row; the old one is
        // recycled, and cells 1..len2 are all overwritten on next pass
        int[] recycled = previous;
        previous = current;
        current = recycled;
      }

      if (longest < minlen) {
        break; // all remaining common substrings are too short, so we stop
      }

      // now we slice away the common substring from both strings
      s1 = s1.substring(0, end1 - longest) + s1.substring(end1);
      s2 = s2.substring(0, end2 - longest) + s2.substring(end2);
      removed += longest;
    }

    return formula.compute(removed, shortlen, longlen);
  }

  /**
   * Always returns true for this comparator. What "tokenized" implies
   * for callers is defined by the Comparator interface, not here.
   */
  public boolean isTokenized() {
    return true;
  }

  /**
   * Sets the length a common substring must at least have in order to
   * be counted. The default is 2.
   *
   * @param minlen the minimum length, which must be 1 or more
   * @throws IllegalArgumentException if minlen is less than 1; with
   *         such a value the extraction loop would have no stopping
   *         point, since a match length is never negative
   */
  public void setMinimumLength(int minlen) {
    if (minlen < 1) {
      throw new IllegalArgumentException(
          "Minimum length must be at least 1, got: " + minlen);
    }
    this.minlen = minlen;
  }

  /**
   * Returns the length a common substring must at least have in order
   * to be counted.
   */
  public int getMinimumLength() {
    return this.minlen;
  }

  /**
   * Sets the formula used to turn the total length of the common
   * substrings into a similarity. The default is {@link Formula#OVERLAP}.
   *
   * @param formula the formula to use; must not be null
   * @throws IllegalArgumentException if formula is null
   */
  public void setFormula(Formula formula) {
    if (formula == null) {
      throw new IllegalArgumentException("Formula must not be null");
    }
    this.formula = formula;
  }

  /**
   * Returns the formula used to compute the similarity.
   */
  public Formula getFormula() {
    return formula;
  }

  /**
   * Represents the different formulas we can use to compute similarity.
   */
  public enum Formula {
    /** removed / shortlen */
    OVERLAP {
      @Override
      public double compute(int removed, int shortlen, int longlen) {
        return removed / (double) shortlen;
      }
    },
    /** 2 * removed / (shortlen + longlen) */
    DICE {
      @Override
      public double compute(int removed, int shortlen, int longlen) {
        return 2 * removed / (double) (shortlen + longlen);
      }
    },
    /** removed / (shortlen + longlen - removed) */
    JACCARD {
      @Override
      public double compute(int removed, int shortlen, int longlen) {
        return removed / (double) (shortlen + longlen - removed);
      }
    };

    /**
     * Computes the similarity. Every constant declared in this enum
     * overrides this method; this version is only a fallback and
     * always throws.
     *
     * @param removed total length of the common substrings found
     * @param shortlen length of the shorter of the two original strings
     * @param longlen length of the longer of the two original strings
     * @return the similarity according to this formula
     */
    public double compute(int removed, int shortlen, int longlen) {
      throw new DukeConfigException("Unknown formula: " + this);
    }
  }
}