package no.priv.garshol.duke.cleaners;

import java.util.Map;
import java.util.HashMap;
import java.io.InputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;

import no.priv.garshol.duke.Cleaner;
import no.priv.garshol.duke.utils.StringUtils;

/**
 * <b>Experimental</b> cleaner for person names, which understands
 * abbreviations like "joe" for "joseph", etc.
 */
public class PersonNameCleaner implements Cleaner {
  private LowerCaseNormalizeCleaner sub;
  private Map<String, String> mapping;

  public PersonNameCleaner() {
    this.sub = new LowerCaseNormalizeCleaner();

    // load token translation mapping (FIXME: move to static init?)
    try {
      this.mapping = loadMapping();
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  public String clean(String value) {
    // do basic cleaning
    value = sub.clean(value);
    if (value == null || value.equals(""))
      return value;

    // tokenize, then map tokens, then rejoin
    String[] tokens = StringUtils.split(value);
    for (int ix = 0; ix < tokens.length; ix++) {
      String mapsto = mapping.get(tokens[ix]);
      if (mapsto != null)
        tokens[ix] = mapsto;
    }

    return StringUtils.join(tokens);
  }

  private Map<String, String> loadMapping() throws IOException {
    String mapfile = "no/priv/garshol/duke/name-mappings.txt";
    Map<String, String> mapping = new HashMap<String, String>();
    ClassLoader cloader = Thread.currentThread().getContextClassLoader();
    InputStream istream = cloader.getResourceAsStream(mapfile);
    InputStreamReader reader = new InputStreamReader(istream, "utf-8");
    BufferedReader in = new BufferedReader(reader);

    String line = in.readLine();
    while (line != null) {
      int pos = line.indexOf(',');
      mapping.put(line.substring(0, pos), line.substring(pos + 1));
      line = in.readLine();
    }

    in.close();
    return mapping;
  }
}
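
/*
 * Usage sketch (illustrative only; the exact token expansions depend on the
 * bundled name-mappings.txt resource, where each line has the form
 * "token,replacement", e.g. "joe,joseph"):
 *
 *   Cleaner cleaner = new PersonNameCleaner();
 *   String cleaned = cleaner.clean("Joe Smith");
 *   // the value is lower-cased and normalized first, then "joe" is
 *   // replaced by "joseph" if that pair is present in the mapping file
 */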