package cc.mallet.pipe; import cc.mallet.types.*; import java.util.HashSet; import java.util.ArrayList; import java.io.*; /** * A simple unicode tokenizer that accepts sequences of letters * as tokens. */ public class SimpleTokenizer extends Pipe { public static final int USE_EMPTY_STOPLIST = 0; public static final int USE_DEFAULT_ENGLISH_STOPLIST = 1; protected HashSet<String> stoplist; public SimpleTokenizer(int languageFlag) { stoplist = new HashSet<String>(); if (languageFlag == USE_DEFAULT_ENGLISH_STOPLIST) { // articles stop("the"); stop("a"); stop("an"); // conjunctions stop("and"); stop("or"); // prepositions stop("of"); stop("for"); stop("in"); stop("on"); stop("to"); stop("with"); stop("by"); // definite pronouns stop("this"); stop("that"); stop("these"); stop("those"); stop("some"); stop("other"); // personal pronouns stop("it"); stop("its"); stop("we"); stop("our"); // conjuctions stop("as"); stop("but"); stop("not"); // verbs stop("do"); stop("does"); stop("is"); stop("be"); stop("are"); stop("can"); stop("was"); stop("were"); } } public SimpleTokenizer(File stopfile) { stoplist = new HashSet<String>(); try { BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(stopfile), "UTF-8")); String word = null; while ((word = in.readLine()) != null) { stop(word); } in.close(); } catch (Exception e) { System.err.println("problem loading stoplist: " + e); } } public SimpleTokenizer(HashSet<String> stoplist) { this.stoplist = stoplist; } public SimpleTokenizer deepClone() { return new SimpleTokenizer((HashSet<String>) stoplist.clone()); } public void stop(String word) { stoplist.add(word); } public Instance pipe(Instance instance) { if (instance.getData() instanceof CharSequence) { CharSequence characters = (CharSequence) instance.getData(); ArrayList<String> tokens = new ArrayList<String>(); int[] tokenBuffer = new int[1000]; int length = -1; // Using code points instead of chars allows us // to support extended Unicode, and has no significant // efficiency costs. int totalCodePoints = Character.codePointCount(characters, 0, characters.length()); for (int i=0; i < totalCodePoints; i++) { int codePoint = Character.codePointAt(characters, i); int codePointType = Character.getType(codePoint); if (codePointType == Character.LOWERCASE_LETTER || codePointType == Character.UPPERCASE_LETTER) { length++; tokenBuffer[length] = codePoint; } else if (codePointType == Character.SPACE_SEPARATOR || codePointType == Character.LINE_SEPARATOR || codePointType == Character.PARAGRAPH_SEPARATOR || codePointType == Character.END_PUNCTUATION || codePointType == Character.DASH_PUNCTUATION || codePointType == Character.CONNECTOR_PUNCTUATION || codePointType == Character.START_PUNCTUATION || codePointType == Character.INITIAL_QUOTE_PUNCTUATION || codePointType == Character.FINAL_QUOTE_PUNCTUATION || codePointType == Character.OTHER_PUNCTUATION) { // Things that delimit words if (length != -1) { String token = new String(tokenBuffer, 0, length + 1); if (! stoplist.contains(token)) { tokens.add(token); } length = -1; } } else if (codePointType == Character.COMBINING_SPACING_MARK || codePointType == Character.ENCLOSING_MARK || codePointType == Character.NON_SPACING_MARK || codePointType == Character.TITLECASE_LETTER || codePointType == Character.MODIFIER_LETTER || codePointType == Character.OTHER_LETTER) { // Obscure things that are technically part of words. // Marks are especially useful for Indic scripts. length++; tokenBuffer[length] = codePoint; } else { // Character.DECIMAL_DIGIT_NUMBER // Character.CONTROL // Character.MATH_SYMBOL //System.out.println("type " + codePointType); } } if (length != -1) { String token = new String(tokenBuffer, 0, length + 1); if (! stoplist.contains(token)) { tokens.add(token); } } instance.setData(tokens); } else { throw new IllegalArgumentException("Looking for a CharSequence, found a " + instance.getData().getClass()); } return instance; } static final long serialVersionUID = 1; }