package no.priv.garshol.duke.cleaners;

import java.text.Normalizer;

import no.priv.garshol.duke.Cleaner;

/**
 * A cleaner which removes leading and trailing whitespace, normalizes
 * internal whitespace, lowercases all characters, and (by default)
 * strips accents. This is the most commonly used cleaner for textual
 * data.
 */
public class LowerCaseNormalizeCleaner implements Cleaner {
  /** U+030A COMBINING RING ABOVE, the mark that turns 'a' into a-ring. */
  private static final char COMBINING_RING_ABOVE = '\u030A';

  /** U+00E5 LATIN SMALL LETTER A WITH RING ABOVE (precomposed, lowercase). */
  private static final char LOWER_A_RING = '\u00E5';

  private boolean stripAccents = true;

  /**
   * Controls whether accents are stripped (that is, "&eacute;" becomes
   * "e", and so on). The default is true.
   *
   * <p>Note: this flag only controls whether the value is decomposed
   * (NFD) before filtering. Combining marks that are already present as
   * separate characters in the input are removed regardless of this
   * setting.
   *
   * @param strip_accents true to decompose accented characters so that
   *        their accents get removed
   */
  public void setStripAccents(boolean strip_accents) {
    this.stripAccents = strip_accents;
  }

  /**
   * Cleans the value: trims leading and trailing whitespace, collapses
   * each internal run of whitespace (space, tab, LF, CR, NBSP) into a
   * single space, lowercases every character, and drops combining
   * diacritical marks. An 'a' or 'A' immediately followed by a combining
   * ring above is kept as the single Scandinavian character U+00E5.
   *
   * @param value the string to clean; must not be null
   * @return the cleaned string (possibly empty)
   * @throws NullPointerException if value is null
   */
  public String clean(String value) {
    if (stripAccents) {
      // after this, accents will be represented as separate combining
      // accent characters trailing the character they belong with. the
      // loop below will strip them out.
      value = Normalizer.normalize(value, Normalizer.Form.NFD);
    }

    int length = value.length();
    // the output can never be longer than the input: characters are
    // only dropped, merged or replaced one-for-one
    char[] tmp = new char[length];
    int pos = 0;
    boolean prevws = false;

    for (int ix = 0; ix < length; ix++) {
      char ch = value.charAt(ix);

      // we make an exception for the combining ring above when it
      // directly follows 'a', because this is a Scandinavian character
      // that should *not* be normalized. the ix > 0 guard matters: a
      // value that *starts* with the ring has no preceding character,
      // and looking at index -1 would throw. such a stray ring simply
      // falls through to the combining-mark filter below.
      if (ch == COMBINING_RING_ABOVE && ix > 0) {
        char prev = value.charAt(ix - 1);
        if (prev == 'a' || prev == 'A') {
          prevws = false;
          // the 'a' was written (lowercased) on the previous iteration,
          // so pos >= 1 here; overwrite it with the precomposed a-ring
          tmp[pos - 1] = LOWER_A_RING;
          continue;
        }
      }

      // if character is combining diacritical mark, skip it.
      if (isCombiningMark(ch)) {
        continue;
      }

      // whitespace processing
      if (isWhitespace(ch)) {
        prevws = true;
      } else {
        // emit one separator for the whole preceding whitespace run,
        // but never at the very start of the output (leading trim).
        // trailing whitespace is never emitted because nothing follows.
        if (prevws && pos != 0) {
          tmp[pos++] = ' ';
        }
        tmp[pos++] = Character.toLowerCase(ch);
        prevws = false;
      }
    }

    return new String(tmp, 0, pos);
  }

  /**
   * Tells whether the character lies in one of the Unicode combining
   * diacritical mark ranges that this cleaner removes.
   */
  private static boolean isCombiningMark(char ch) {
    return (ch >= 0x0300 && ch <= 0x036F)
        || (ch >= 0x1DC0 && ch <= 0x1DFF)
        || (ch >= 0x20D0 && ch <= 0x20FF)
        || (ch >= 0xFE20 && ch <= 0xFE2F);
  }

  /**
   * Tells whether the character is treated as whitespace by this
   * cleaner: space, tab, line feed, carriage return or NBSP.
   */
  private static boolean isWhitespace(char ch) {
    return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r'
        || ch == 0xA0 /* NBSP */;
  }
}