// x = the sound of both sj and kj // ^ == start of string // $ == end of string // [abc] == a set of characters (as in regexp) // IMPLEMENTED // vowels stripped, except initial vowel // double consonants collapse into one // ^aa -> å // ch -> k // ck -> k // [oiuaeæødy]d -> // dt$ -> t // gh -> k // gj -> j // ^gi -> j // hg -> k // hj -> j // hl -> l // hr -> r // kj -> x // ki -> x // ld -> l // nd -> n // ph -> f // th -> t // w -> v // x -> ks // z -> s // NOT IMPLEMENTED // ^c -> k // sj -> x // skj -> x // ^ei -> æ // d -> t // g -> k // kei -> x // skei -> x // ^ky -> x // ^sky -> x // NOT SURE ABOUT THESE // ^ch[aeiouy] -> x (charlotte) // en$ -> package no.priv.garshol.duke.comparators; import no.priv.garshol.duke.Comparator; /** * My own algorithm for phonetic matching of Norwegian names, inspired * by Metaphone. */ public class NorphoneComparator implements Comparator { public double compare(String s1, String s2) { if (s1.equals(s2)) return 1.0; if (norphone(s1).equals(norphone(s2))) return 0.9; return 0.0; } public boolean isTokenized() { return false; } /** * Produces the Norphone key for the given string. */ public static String norphone(String str) { if (str.length() < 1) return ""; // no norphone key for the empty string str = str.toUpperCase(); char[] key = new char[str.length() * 2]; // could be all X-es int pos = 0; Matcher m = new Matcher(str); while (m.hasNext()) { char ch = m.next(); // discard duplicate characters if (m.isNext(ch) && ch != 'A') ch = ' '; // discard vowels else if (isVowel(ch) && !m.atStart()) ch = ' '; else { switch(ch) { case 'A': // we only come here on the first character if (m.isNext('A')) ch = '\u00C5'; // Å break; case 'C': if (m.isNext('H') || m.isNext('K')) { ch = 'K'; m.skip(); } else ch = 'K'; break; case 'D': if (m.isNext('T') && m.nextIsLast()) { ch = 'T'; m.skip(); } else if (m.previousOneOf("IOUAEY\u00D8\u00C6\u00C5") && m.isLast()) ch = ' '; break; case 'G': if (m.isNext('H')) m.skip(); // 'H' is silent else if (m.isNext('J') || (m.isNext('I') && m.atStart())) { ch = 'J'; m.skip(); } break; case 'H': if (m.isNext('J')) { ch = 'J'; m.skip(); } else if (m.isNext('L')) { ch = 'L'; m.skip(); } else if (m.isNext('G')) { ch = 'G'; m.skip(); } else if (m.isNext('R')) { ch = 'R'; m.skip(); } break; case 'K': if (m.isNext('J') || m.isNext('I')) { ch = 'X'; m.skip(); } break; case 'L': if (m.isNext('D') && m.nextIsLast()) { ch = 'L'; m.skip(); } break; case 'N': if (m.isNext('D')) m.skip(); break; case 'P': if (m.isNext('H')) { ch = 'F'; m.skip(); // eat the 'H' } break; case 'T': if (m.isNext('H')) m.skip(); // 'H' is silent break; case 'W': ch = 'V'; break; case 'X': key[pos++] = 'K'; ch = 'S'; break; case 'Z': ch = 'S'; break; } } if (ch != ' ') key[pos++] = ch; } return new String(key, 0, pos); } private static boolean isVowel(char ch) { return (ch == 'A' || ch == 'E' || ch == 'I' || ch == 'O' || ch == 'U' || ch == 'Y' || ch == '\u00C5' || ch == '\u00C6' || ch == '\u00D8'); } }