package org.exist.storage.analysis;
/**
 * This is the default class used by the fulltext indexer for
 * tokenizing a string into words. Known token types are defined
 * by class TextToken.
 *
 * <p>The tokenizer keeps its scanning state (current text and position) in
 * instance fields without any synchronization, and punctuation tokens are
 * handed out through one reused {@link TextToken} instance (see {@link #p()}).
 *
 *@author Wolfgang Meier
 */
public class SimpleTokenizer implements Tokenizer {

    /**
     * Sentinel (U+FFFF) returned by {@link #LA(int)} when the lookahead position
     * lies beyond the end of the text. A literal U+FFFF inside the text is
     * therefore indistinguishable from end of input.
     */
    private static final char EOF_CHAR = (char) -1;

    /** Number of characters consumed so far; the next character is {@code text.charAt(pos)}. */
    private int pos = 0;

    /** Stemming flag; stored by the constructor/setter but not read inside this class. */
    private boolean stem = false;

    /** The text currently being tokenized; {@code null} until {@code setText} is called. */
    private CharSequence text;

    /** Cached length of {@link #text}. */
    private int len = 0;

    /** Reusable token instance returned by {@link #p()} for punctuation. */
    private final TextToken temp = new TextToken();

    /** Creates a tokenizer with stemming switched off. */
    public SimpleTokenizer() {
    }

    /**
     * Creates a tokenizer with the given stemming flag.
     *
     * @param stem value stored in the stemming flag
     */
    public SimpleTokenizer(boolean stem) {
        this.stem = stem;
    }

    /**
     * Sets the stemming flag.
     *
     * @param stem value stored in the stemming flag
     */
    public void setStemming(boolean stem) {
        this.stem = stem;
    }

    /**
     * Looks ahead without consuming. {@code LA(1)} is the next unconsumed
     * character, {@code LA(2)} the one after it, and so on.
     *
     * @param i 1-based lookahead distance
     * @return the character at that distance, or {@link #EOF_CHAR} when it
     *         lies past the end of the text
     */
    private final char LA(int i) {
        final int current = pos + i;
        return current > len ? EOF_CHAR : text.charAt(current - 1);
    }

    /**
     * Consumes a run of letters (plus marks, non-breaking CJK characters and,
     * optionally, wildcards) into an ALPHA token. If the run is directly
     * followed by a digit, scanning continues in {@link #alphanum} and the
     * token becomes ALPHANUM.
     *
     * @param token token to extend, or {@code null} to start a new one at the
     *        current position
     * @param allowWildcards whether {@code ?} and {@code *} count as word characters
     * @return the token that was created or extended
     */
    protected TextToken alpha(TextToken token, boolean allowWildcards) {
        if (token == null) {
            token = new TextToken(TextToken.ALPHA, text, pos);
        } else {
            token.setType(TextToken.ALPHA);
        }
        // consume letters
        char ch = LA(1);
        int count = 0;
        while (ch != EOF_CHAR) {
            if (ch == '\\' && isWildcard(LA(2))) {
                // an escaped wildcard terminates the word
                break;
            } else if (ch >= '\u2E80' && singleCharToken(ch)) {
                // The range check is a cheap pre-filter for singleCharToken().
                // It must be inclusive: U+2E80 itself is a single-char token
                // and is not a letter, so excluding it would return an empty
                // token without advancing the position.
                //
                // If this is a single char token and first in the sequence,
                // consume it; otherwise leave it for the next call.
                if (count == 0) {
                    token.consumeNext();
                    consume();
                    ch = LA(1);
                }
                break;
            } else if (Character.isLetter(ch) || is_mark(ch) || nonBreakingChar(ch)
                    || (allowWildcards && isWildcard(ch))) {
                token.consumeNext();
                consume();
                ch = LA(1);
                count++;
            } else {
                break;
            }
        }
        if (Character.isDigit(ch)) {
            // a digit follows directly: continue as alphanumeric token
            return alphanum(token, allowWildcards);
        }
        return token;
    }

    /**
     * @param ch character to test
     * @return {@code true} if {@code ch} is one of the wildcard characters
     *         {@code ?} or {@code *}
     */
    private static final boolean isWildcard(char ch) {
        return ch == '?' || ch == '*';
    }

    /**
     * Consumes a run of letters and digits (and, optionally, wildcards) into
     * an ALPHANUM token.
     *
     * @param token token to extend, or {@code null} to start a new one at the
     *        current position
     * @param allowWildcards whether {@code ?} and {@code *} count as word characters
     * @return the token that was created or extended
     */
    protected TextToken alphanum(TextToken token, boolean allowWildcards) {
        if (token == null) {
            token = new TextToken(TextToken.ALPHANUM, text, pos);
        } else {
            token.setType(TextToken.ALPHANUM);
        }
        char ch = LA(1);
        while (ch != EOF_CHAR
                && (Character.isLetterOrDigit(ch) || (allowWildcards && isWildcard(ch)))) {
            token.consumeNext();
            consume();
            ch = LA(1);
        }
        return token;
    }

    /** Advances the scan position by one character. */
    protected void consume() {
        pos++;
    }

    /**
     * Advances the position by one and signals end of input.
     *
     * @return the shared {@code TextToken.EOF_TOKEN}
     */
    protected TextToken eof() {
        consume();
        return TextToken.EOF_TOKEN;
    }

    /**
     * @return the length of the text passed to the last {@code setText} call
     *         (0 if none was made)
     */
    public int getLength() {
        return len;
    }

    /**
     * @return the current text as a string
     * @throws NullPointerException if no text has been set yet
     */
    public String getText() {
        return text.toString();
    }

    /**
     * Reads the next low-level token at the current position: a word
     * (ALPHA/ALPHANUM), a punctuation character (P), whitespace or EOF.
     * Every character that is neither word material nor one of the listed
     * punctuation characters is consumed and reported as whitespace.
     *
     * @param wildcards whether {@code ?} and {@code *} count as word characters
     * @return the token read; never {@code null}
     */
    protected TextToken nextTerminalToken(boolean wildcards) {
        TextToken token = null;
        final char ch = LA(1);
        if (ch == EOF_CHAR) {
            return eof();
        }
        if (Character.isLetter(ch) || is_mark(ch) || nonBreakingChar(ch)
                || singleCharToken(ch)
                || (wildcards && isWildcard(ch))) {
            token = alpha(null, wildcards);
        }
        if (token == null
                && (Character.isLetterOrDigit(ch)
                    || (wildcards && isWildcard(ch)))) {
            token = alphanum(null, wildcards);
        }
        if (token == null) {
            switch (ch) {
                case '\\':
                    if (isWildcard(LA(2))) {
                        // skip the backslash so that p() picks up the escaped wildcard
                        consume();
                    }
                    // intentional fall through: emit a punctuation token
                case '*' :
                case ',' :
                case '-' :
                case '_' :
                case ':' :
                case '.' :
                case '@' :
                case '/' :
                    token = p();
                    break;
                default :
                    token = whitespace();
                    break;
            }
        }
        return token;
    }

    /**
     * Returns the next word token with wildcards disabled.
     *
     * @return the next token, or {@code null} at end of input
     * @see #nextToken(boolean)
     */
    public TextToken nextToken() {
        return nextToken(false);
    }

    /**
     * Returns the next word token, skipping whitespace and stand-alone
     * punctuation. Words joined by an apostrophe are merged into one ALPHA
     * token; words joined by certain punctuation characters (URLs, email
     * addresses, dates, sequences like Q/22/A4.5) are merged into one
     * ALPHANUM token.
     *
     * @param wildcards whether {@code ?} and {@code *} count as word characters
     * @return the next token, or {@code null} at end of input. {@code null} is
     *         also returned when an exception occurs while scanning; in that
     *         case the text and the stack trace are printed.
     */
    public TextToken nextToken(boolean wildcards) {
        try {
            while (true) {
                TextToken token = nextTerminalToken(wildcards);
                TextToken next;
                final int oldPos = pos;
                final char la1 = LA(1);
                switch (token.getType()) {
                    case TextToken.EOF :
                        return null;
                    case TextToken.ALPHA :
                        switch (la1) {
                            // text with apostrophe like Peter's
                            case '\'' :
                                consume();
                                next = nextTerminalToken(wildcards);
                                if (next != null
                                        && next.getType() == TextToken.ALPHA) {
                                    return new TextToken(TextToken.ALPHA, text,
                                            token.startOffset(), next.endOffset());
                                }
                                // no word after the apostrophe: rewind
                                pos = oldPos;
                                break;
                            // text with some alphanumeric sequence attached
                            // handles URL's, email addresses, dates or general
                            // sequences like Q/22/A4.5 or 12/09/1989
                            case '_' :
                            case ':' :
                            case '.' :
                            case '/' :
                            case '@' :
                                if (LA(2) == EOF_CHAR
                                        || Character.isWhitespace(LA(2))) {
                                    // trailing punctuation: drop it and return the word
                                    consume();
                                    break;
                                }
                                TextToken last = null;
                                while ((next = nextTerminalToken(wildcards)) != null) {
                                    if (next.getType() == TextToken.EOF
                                            || next.getType() == TextToken.WS) {
                                        break;
                                    }
                                    if (next.getType() == TextToken.P
                                            && (LA(2) == EOF_CHAR
                                                || Character.isWhitespace(LA(2)))) {
                                        break;
                                    }
                                    last = next;
                                }
                                if (last != null) {
                                    token = new TextToken(TextToken.ALPHANUM, text,
                                            token.startOffset(), last.endOffset());
                                } else {
                                    pos = oldPos;
                                }
                        }
                        return token;
                    case TextToken.ALPHANUM :
                        switch (la1) {
                            case '/' :
                            case '*' :
                            case ',' :
                            case '-' :
                            case '_' :
                            case ':' :
                            case '.' :
                            case '@' :
                                if (LA(2) == EOF_CHAR
                                        || Character.isWhitespace(LA(2))) {
                                    // trailing punctuation: drop it and return the word
                                    consume();
                                    break;
                                }
                                TextToken last = null;
                                while ((next = nextTerminalToken(wildcards)) != null) {
                                    if (next.getType() == TextToken.EOF
                                            || next.getType() == TextToken.WS) {
                                        break;
                                    }
                                    last = next;
                                }
                                if (last != null) {
                                    token = new TextToken(TextToken.ALPHANUM, text,
                                            token.startOffset(), last.endOffset());
                                } else {
                                    token = new TextToken(TextToken.ALPHANUM, text,
                                            token.startOffset(), pos);
                                }
                        }
                        return token;
                    default :
                        // whitespace or stand-alone punctuation: keep scanning
                        break;
                }
            }
        } catch (Exception e) {
            // NOTE(review): best-effort handling kept from the original code;
            // no logging facility is in scope in this file.
            System.out.println("text: " + text);
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Consumes a run of digits into a NUMBER token. If the digits are directly
     * followed by a letter, the position is restored and nothing is consumed.
     *
     * @return the NUMBER token, or {@code null} if a letter follows the digits
     */
    protected TextToken number() {
        final TextToken token = new TextToken(TextToken.NUMBER, text, pos);
        final int oldPos = pos;
        while (LA(1) != EOF_CHAR && Character.isDigit(LA(1))) {
            token.consumeNext();
            consume();
        }
        if (Character.isLetter(LA(1))) {
            pos = oldPos;
            return null;
        }
        return token;
    }

    /**
     * Consumes one character as a punctuation (P) token.
     *
     * @return the tokenizer's single reusable token instance; its contents are
     *         overwritten by the next call to this method
     */
    protected TextToken p() {
        temp.set(TextToken.P, text, pos);
        temp.consumeNext();
        consume();
        return temp;
    }

    /**
     * Sets the text to tokenize and restarts scanning at its beginning.
     *
     * @param text the text to tokenize; must not be {@code null}
     */
    public void setText(CharSequence text) {
        pos = 0;
        len = text.length();
        this.text = text;
    }

    /**
     * Sets the text to tokenize and starts scanning at the given offset.
     *
     * @param text the text to tokenize; must not be {@code null}
     * @param offset index of the first character to scan (not validated here)
     */
    public void setText(CharSequence text, int offset) {
        pos = offset;
        len = text.length();
        this.text = text;
    }

    /**
     * Consumes one character as whitespace.
     *
     * @return the shared {@code TextToken.WS_TOKEN}
     */
    protected TextToken whitespace() {
        consume();
        return TextToken.WS_TOKEN;
    }

    /**
     * The code ranges defined here should be interpreted as 1-char
     * tokens.
     *
     * @param ch character to test
     * @return {@code true} if {@code ch} lies in one of the listed ranges
     */
    private static final boolean singleCharToken(char ch) {
        return
            // CJK Radicals Supplement
            (ch >= '\u2E80' && ch <= '\u2EFF') ||
            // KangXi Radicals
            (ch >= '\u2F00' && ch <= '\u2FDF') ||
            // Ideographic Description Characters
            (ch >= '\u2FF0' && ch <= '\u2FFF') ||
            // Enclosed CJK Letters and Months
            (ch >= '\u3200' && ch <= '\u32FF') ||
            // CJK Compatibility
            (ch >= '\u3300' && ch <= '\u33FF') ||
            // CJK Unified Ideographs Extension A
            (ch >= '\u3400' && ch <= '\u4DB5') ||
            // Yijing Hexagram Symbols
            (ch >= '\u4DC0' && ch <= '\u4DFF') ||
            // CJK Unified Ideographs
            (ch >= '\u4E00' && ch <= '\u9FFF') ||
            // CJK Compatibility Ideographs
            (ch >= '\uF900' && ch <= '\uFAFF') ||
            // CJK Compatibility Forms
            (ch >= '\uFE30' && ch <= '\uFE4F');
    }

    /**
     * These codepoints should not be broken into tokens.
     *
     * @param ch character to test
     * @return {@code true} if {@code ch} lies in one of the listed ranges
     */
    private static final boolean nonBreakingChar(char ch) {
        return
            // Hiragana
            (ch >= '\u3040' && ch <= '\u309F') ||
            // Katakana
            (ch >= '\u30A0' && ch <= '\u30FF') ||
            // Bopomofo
            (ch >= '\u3100' && ch <= '\u312F') ||
            // Hangul Compatibility Jamo
            (ch >= '\u3130' && ch <= '\u318F') ||
            // Kanbun
            (ch >= '\u3190' && ch <= '\u319F') ||
            // Bopomofo Extended
            (ch >= '\u31A0' && ch <= '\u31BF') ||
            // Katakana Phonetic Extensions
            (ch >= '\u31F0' && ch <= '\u31FF') ||
            // Hangul Syllables
            (ch >= '\uAC00' && ch <= '\uD7A3');
    }

    /**
     * Tests for a combining mark that must stay attached to its word.
     *
     * @param ch character to test
     * @return {@code true} if {@code ch} lies strictly between U+093D and
     *         U+094C (both bounds exclusive). NOTE(review): this looks like
     *         the Devanagari dependent vowel signs; confirm whether U+094C
     *         and U+094D are excluded on purpose.
     */
    private static final boolean is_mark(char ch) {
        return (ch > '\u093d' && ch < '\u094c');
    }

    /**
     * Small demo: tokenizes a fixed Chinese sentence and prints each token.
     *
     * @param args ignored
     */
    public static void main(String args[]) {
        String t1 = "\u4ED6\u4E3A\u8FD9\u9879\u5DE5\u7A0B\u6295\u5165\u4E86\u5341\u4E09\u5E74\u65F6\u95F4\u3002";
        SimpleTokenizer tokenizer = new SimpleTokenizer();
        tokenizer.setText(t1);
        TextToken token = tokenizer.nextToken(false);
        while (token != null && token.getType() != TextToken.EOF) {
            System.out.println(token.getText());
            token = tokenizer.nextToken(false);
        }
    }
}