package edu.harvard.wcfia.yoshikoder.document.tokenizer;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * A tokenizer that returns either token spans or whole tokens
 * (both backed by the text handed in).
 *
 * @author will
 */
public class WordTokenizer {

    private static final Logger log =
        Logger.getLogger(WordTokenizer.class.getName());

    protected Locale locale;
    protected BreakIterator wordIterator;
    public WordTokenizer(Locale loc) {
        if (loc == null) {
            locale = Locale.getDefault();
            log.info("Null handed in as Locale, using default: " + locale);
        } else {
            locale = loc;
        }
        wordIterator = BreakIterator.getWordInstance(locale);
    }
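
    /**
     * Returns the [start, end) character offsets of each word token in
     * {@code txt}, skipping whitespace and punctuation breaks. A span is
     * kept only when its first character is a letter, digit, or currency
     * symbol.
     */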
    public int[][] getTokenSpans(String txt) {
        wordIterator.setText(txt);
        List<int[]> list = new ArrayList<int[]>();
        int start = wordIterator.first();
        int end = wordIterator.next();
        while (end != BreakIterator.DONE) {
            // keep only spans that begin with a word-like character
            char c = txt.charAt(start);
            if (Character.isLetterOrDigit(c) ||
                    Character.getType(c) == Character.CURRENCY_SYMBOL) {
                list.add(new int[]{start, end});
            }
            start = end;
            try {
                end = wordIterator.next(); // throws exceptions rarely
            } catch (Exception e) {
                log.log(Level.WARNING,
                        "tokenization error somewhere after character " + end,
                        e);
                // stop iterating: retrying next() at the same position would loop forever
                end = BreakIterator.DONE;
            }
        }
        wordIterator.setText(""); // drop any references to documents we might be keeping
        return list.toArray(new int[list.size()][]);
    }
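
    /**
     * Returns the word tokens of {@code txt} as strings, i.e. the
     * substrings covered by {@link #getTokenSpans(String)}.
     */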
    public String[] getTokens(String txt) {
        int[][] spans = getTokenSpans(txt);
        String[] s = new String[spans.length];
        for (int ii = 0; ii < s.length; ii++) {
            s[ii] = txt.substring(spans[ii][0], spans[ii][1]);
        }
        return s;
    }
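
    // Minimal usage sketch (illustrative only, not part of the original class):
    // tokenize a short English sentence and print each token with its span.
    public static void main(String[] args) {
        WordTokenizer tokenizer = new WordTokenizer(Locale.ENGLISH);
        String text = "The cat costs $20, or thereabouts.";
        int[][] spans = tokenizer.getTokenSpans(text);
        String[] tokens = tokenizer.getTokens(text);
        for (int ii = 0; ii < tokens.length; ii++) {
            System.out.println(tokens[ii] + " [" + spans[ii][0] + "," + spans[ii][1] + ")");
        }
    }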
}