package no.priv.garshol.duke.comparators; import no.priv.garshol.duke.Comparator; // http://www.wbrogden.com/java/Phonetic/index.html // http://www.wbrogden.com/phonetic/index.html /** * An implementation of the Metaphone algorithm, and a comparator * which considers strings to have a score of 0.9 if their Metaphone * values match. */ public class MetaphoneComparator implements Comparator { public double compare(String s1, String s2) { if (s1.equals(s2)) return 1.0; if (metaphone(s1).equals(metaphone(s2))) return 0.9; return 0.0; } public boolean isTokenized() { return true; // I guess? } /** * Produces the Metaphone key for the given string. */ public static String metaphone(String str) { if (str.length() < 1) return ""; // no metaphone key for the empty string str = str.toUpperCase(); char[] key = new char[str.length() * 2]; // could be all X-es int pos = 0; for (int ix = 0; ix < str.length(); ix++) { char ch = str.charAt(ix); if (isVowel(ch) && ch != 'Y') { if (ix != 0) ch = ' '; // meaning: skip // Initial ae- -> drop first letter else if (ix == 0 && ch == 'A' && str.length() > 1 && str.charAt(ix + 1) == 'E') { ch = 'E'; ix++; } } else { // skip double consonant if (ch != 'C' && ix + 1 < str.length() && str.charAt(ix + 1) == ch) ch = str.charAt(++ix); switch(ch) { case 'B': // B -> B unless at the end of a word after "m" as in "dumb" if (ix + 1 == str.length() && ix != 0 && str.charAt(ix - 1) == 'M') ch = ' '; // skip break; case 'C': // C -> X (sh) if -cia- or -ch- // S if -ci-, -ce- or -cy- // K otherwise, including -sch- ch = 'K'; // default if (ix > 0 && str.charAt(ix - 1) == 'S' && ix + 1 < str.length() && str.charAt(ix + 1) == 'H') ix++; // skip the 'H' else if (ix + 1 < str.length()) { char next = str.charAt(ix + 1); if (next == 'I' && ix + 2 < str.length() && str.charAt(ix + 2) == 'A') ch = 'X'; else if (next == 'I' || next == 'E' || next == 'Y') ch = 'S'; else if (next == 'H') { ch = 'X'; ix++; // we need to skip the H } } break; case 'D': // D -> J if in -dge-, -dgy- or -dgi- // T otherwise if (ix + 2 < str.length() && str.charAt(ix + 1) == 'G' && (str.charAt(ix + 2) == 'E' || str.charAt(ix + 2) == 'Y' || str.charAt(ix + 2) == 'I')) { ch = 'J'; ix += 2; // skip over next } else ch = 'T'; break; case 'G': // G -> silent if in -gh- and not at end or before a vowel // in -gn- or -gned- (also see dge etc. above) // J if before i or e or y if not double gg // K otherwise // Initial gn- pn, ae- or wr- -> drop first letter ch = 'K'; if (ix == 0 && str.length() > 1 && str.charAt(ix + 1) == 'N') ch = ' '; else if (ix + 1 < str.length() && str.charAt(ix + 1) == 'H') { if (ix + 2 == str.length() || (ix + 2 < str.length() && isVowel(str.charAt(ix + 2)))) { // not at end ch = ' '; // skip ix++; // skip the 'H', too } } else if (ix + 1 < str.length() && str.charAt(ix + 1) == 'N') ch = ' '; // skip else if (ix + 1 < str.length() && (str.charAt(ix + 1) == 'I' || str.charAt(ix + 1) == 'E' || str.charAt(ix + 1) == 'Y') && (ix == 0 || str.charAt(ix - 1) != 'G')) ch = 'J'; break; case 'H': // H -> silent if after vowel and no vowel follows // H otherwise if (ix > 0 && isVowel(str.charAt(ix - 1)) && ix + 1 < str.length() && !isVowel(str.charAt(ix + 1))) ch = ' '; // silent break; case 'K': // K -> silent if after "c" // K otherwise // Initial kn-, gn- pn, ae- or wr- -> drop first letter if ((ix > 0 && str.charAt(ix - 1) == 'C') || (ix == 0 && str.length() > 1 && str.charAt(ix + 1) == 'N')) ch = ' '; // silent break; case 'P': // P -> F if before "h" // P otherwise // Initial pn, ae- or wr- -> drop first letter if (ix == 0 && str.length() > 1 && str.charAt(ix + 1) == 'N') ch = ' '; else if (ix + 1 < str.length() && str.charAt(ix + 1) == 'H') { ch = 'F'; ix++; // skip the following 'H' } break; case 'Q': ch = 'K'; break; case 'S': // S -> X (sh) if before "h" or in -sio- or -sia- // S otherwise if ((ix + 1 < str.length() && str.charAt(ix + 1) == 'H') || (ix + 2 < str.length() && str.charAt(ix + 1) == 'I' && (str.charAt(ix + 2) == 'O' || str.charAt(ix + 2) == 'A'))) { ch = 'X'; ix++; // skip the 'H', too } break; case 'T': // T -> X (sh) if -tia- or -tio- // 0 (th) if before "h" // silent if in -tch- // T otherwise if (ix + 2 < str.length() && str.charAt(ix + 1) == 'I' && (str.charAt(ix + 2) == 'A' || str.charAt(ix + 2) == 'O')) ch = 'X'; else if (ix + 1 < str.length() && str.charAt(ix + 1) == 'H') { ch = '0'; ix++; // skip the 'H' } else if (ix + 2 < str.length() && str.charAt(ix + 1) == 'C' && str.charAt(ix + 2) == 'H') ch = ' '; break; case 'V': ch = 'F'; break; case 'W': // W -> silent if not followed by a vowel // W if followed by a vowel // Initial wh- -> change to "w" // Initial wr- -> drop first letter if (ix == 0 && str.length() > 1 && str.charAt(ix + 1) == 'H') ix++; // skip the 'H' else if (ix == 0 && str.length() > 1 && str.charAt(ix + 1) == 'R') ch = ' '; // drop the 'W' else if (ix + 1 < str.length() && !isVowel(str.charAt(ix + 1))) ch = ' '; break; case 'X': // Initial x- -> change to "s" if (ix > 0) key[pos++] = 'K'; ch = 'S'; break; case 'Y': // Y -> silent if not followed by a vowel // Y if followed by a vowel if ((ix + 1 < str.length() && !isVowel(str.charAt(ix + 1))) || ix + 1 == str.length()) ch = ' '; break; case 'Z': ch = 'S'; } } if (ch != ' ') key[pos++] = ch; } return new String(key, 0, pos); } private static boolean isVowel(char ch) { return (ch == 'A' || ch == 'E' || ch == 'I' || ch == 'O' || ch == 'U' || ch == 'Y'); } }