/**
*
*/
package org.voyanttools.trombone.lucene.analysis.icu;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
/**
 * ICU tokenizer configuration that returns a custom rule-based word break
 * iterator when the configured language is Tibetan ({@code "bo"}) and otherwise
 * defers to {@link DefaultICUTokenizerConfig}.
 *
 * <p>The custom rules are read from the {@code tromboneDefault.rbbi} classpath
 * resource, resolved relative to this class. The rules are compiled once and
 * cached in a static prototype; callers always receive a clone of it.
 *
 * @author sgs
 */
public class TromboneICUTokenizerConfig extends DefaultICUTokenizerConfig {

    /** Language code that selects the custom break iterator. */
    private static final String TIBETAN = "bo";

    /** Classpath resource (relative to this class) holding the break rules. */
    private static final String RULES_RESOURCE = "tromboneDefault.rbbi";

    /** Message used whenever the rules resource cannot be obtained or read. */
    private static final String LOAD_ERROR = "Unable to load trombone break iterator rules.";

    /**
     * Shared prototype built from {@link #RULES_RESOURCE}. Only read and written
     * inside {@link #wordBreakIterator()}, which is synchronized.
     */
    private static BreakIterator TROMBONE_WORD_BREAK_ITERATOR;

    /** Language code supplied at construction; may be {@code null}. */
    private final String language;

    /**
     * Creates a configuration for the given language.
     *
     * @param cjkAsWords forwarded unchanged to {@link DefaultICUTokenizerConfig}
     * @param myanmarAsWords forwarded unchanged to {@link DefaultICUTokenizerConfig}
     * @param language language code; {@code "bo"} selects the custom break iterator
     * @throws RuntimeException if the break rules resource is missing or unreadable
     */
    public TromboneICUTokenizerConfig(boolean cjkAsWords, boolean myanmarAsWords, String language) {
        super(cjkAsWords, myanmarAsWords);
        this.language = language;
        // Load eagerly so a missing or broken rules resource still fails at
        // construction time; after the first successful load this is a cheap lookup.
        wordBreakIterator();
    }

    /**
     * Returns a clone of the custom break iterator when the language is Tibetan,
     * otherwise whatever the superclass returns for {@code script}.
     *
     * @param script script code, only consulted by the superclass fallback
     * @return a break iterator for the caller to use
     */
    @Override
    public BreakIterator getBreakIterator(int script) {
        // Constant-first comparison: a null language falls through to the default.
        if (TIBETAN.equals(language)) {
            return (BreakIterator) wordBreakIterator().clone();
        }
        return super.getBreakIterator(script);
    }

    /**
     * Returns the shared prototype, compiling it from the rules resource on first use.
     * Callers must clone the result before calling {@code setText} on it.
     */
    private static synchronized BreakIterator wordBreakIterator() {
        if (TROMBONE_WORD_BREAK_ITERATOR == null) {
            TROMBONE_WORD_BREAK_ITERATOR = new RuleBasedBreakIterator(readRules());
        }
        return TROMBONE_WORD_BREAK_ITERATOR;
    }

    /** Reads the break rules resource into a string, always closing the stream. */
    private static String readRules() {
        // Resolve against this class rather than getClass(), so a subclass in
        // another package still finds the resource.
        InputStream is = TromboneICUTokenizerConfig.class.getResourceAsStream(RULES_RESOURCE);
        if (is == null) {
            throw new RuntimeException(LOAD_ERROR + " Resource not found: " + RULES_RESOURCE);
        }
        try {
            // Decode explicitly instead of relying on the platform default charset.
            // NOTE(review): assumes the .rbbi file is stored as UTF-8 - confirm.
            return IOUtils.toString(is, "UTF-8");
        } catch (IOException e) {
            throw new RuntimeException(LOAD_ERROR, e);
        } finally {
            IOUtils.closeQuietly(is);
        }
    }

    /**
     * Manual smoke test: prints each segment the custom break iterator finds in a
     * sample string.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Work on a clone so the shared prototype is never mutated by setText.
        BreakIterator boundary = (BreakIterator) wordBreakIterator().clone();
        String text = "ཆུ་ཡོད་མ་རེད། ཁང་པ་བརྗེ་བོ་བརྒྱབ་དང་ལབ་ཀྱི་འདུག་ར། ཁང་པ་བརྗེ་བོ་བརྒྱབ་ན་ང་ཚོ་འདིའི་རྒྱབ་ལོགས་འདི་ལ། ཁང་པ་ཉི་མ་ཁ་ཤས་ཤིག་ལ་མི་སླེབས་པ་ཡོད་ལབ་ཡིན་པ one་two";
        boundary.setText(text);
        int start = boundary.first();
        for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
            System.out.println(text.substring(start, end));
        }
    }
}