/*******************************************************************************
 * Trombone is a flexible text processing and analysis library used
 * primarily by Voyant Tools (voyant-tools.org).
 *
 * Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
 *
 * This file is part of Trombone.
 *
 * Trombone is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Trombone is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Trombone.  If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package org.voyanttools.trombone.lucene.analysis;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cn.smart.HMMChineseTokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.UnicodeWhitespaceTokenizer;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.tika.io.IOUtils;
import org.voyanttools.trombone.lucene.analysis.icu.TromboneICUTokenizerConfig;
import org.voyanttools.trombone.model.TokenType;
import org.voyanttools.trombone.util.FlexibleParameters;

/**
 * An {@link Analyzer} for the "lexical" field that adapts its tokenization to
 * per-document instructions. Because Lucene offers no per-document,
 * content-aware way of parameterizing an analyzer, the indexer (see
 * {@code LuceneIndexer}) appends an HTML-style comment of the form
 * {@code <!--key=value&key=value-->} to the end of the document text; this
 * analyzer parses and strips that comment in {@link #initReader(Reader)} and
 * uses the recovered parameters (notably {@code language} and
 * {@code tokenization}) to choose a tokenizer in
 * {@link #createComponents(String)}.
 *
 * <p>NOTE(review): {@link #parameters} and {@link #lang} are mutable analyzer
 * state set during {@code initReader} and read during
 * {@code createComponents}; this assumes the two calls happen back-to-back for
 * the same document on the same thread — confirm against indexing usage.
 *
 * @author sgs
 */
public class LexicalAnalyzer extends Analyzer {

	/** Parameters parsed from the current document's trailing instruction comment. */
	protected FlexibleParameters parameters = new FlexibleParameters();

	/** Language code from the "language" parameter (e.g. "zh", "bo"); empty when unknown. */
	protected String lang = "";

	/**
	 * Prepares the reader for a field: for the lexical field, first parses and
	 * strips the trailing instruction comment; in all cases the result is
	 * wrapped in an {@link HTMLCharFilter}.
	 *
	 * @param fieldName the field being analyzed
	 * @param reader the raw field content
	 * @return the filtered reader
	 */
	@Override
	protected Reader initReader(String fieldName, Reader reader) {
		if (fieldName.equals(TokenType.lexical.name())) {
			reader = initReader(reader);
		} else {
			// non-lexical fields carry no instructions; reset state so values
			// from a previously-analyzed document don't leak into this one
			parameters.clear();
			lang = "";
		}
		try {
			return new HTMLCharFilter(reader);
		} catch (IOException e) {
			throw new RuntimeException("Unable to filter HTML for field: " + fieldName, e);
		}
	}

	/**
	 * Reads the full document text, then parses and strips the trailing
	 * {@code <!--…-->} instruction comment (if present), populating
	 * {@link #parameters} and {@link #lang}. Instructions are used to
	 * determine the language and whether a simple word-boundary or whitespace
	 * tokenizer should be used instead of the default (for some Asian
	 * languages the default tokenizer is too aggressive and the user may want
	 * to do segmentation themselves).
	 *
	 * @param reader the raw lexical field content
	 * @return a reader over the text with any instruction comment removed
	 */
	protected Reader initReader(Reader reader) {
		String text;
		try {
			text = IOUtils.toString(reader);
		} catch (IOException e) {
			throw new RuntimeException("Unable to read document text", e);
		}
		if (text.endsWith("-->") && text.contains("<!--")) {
			int start = text.lastIndexOf("<!--");
			// the instruction payload sits between "<!--" and the final "-->"
			parameters = getParameters(text.substring(start + 4, text.length() - 3));
			if (parameters.containsKey("language")) {
				lang = parameters.getParameterValue("language");
			}
			text = text.substring(0, start);
		} else {
			// no instructions for this document: reset both parameters and
			// language so stale values can't select the wrong tokenizer below
			parameters.clear();
			lang = "";
		}
		return new StringReader(text);
	}

	/**
	 * Builds the token stream for a field, honouring the per-document
	 * parameters recovered in {@link #initReader(Reader)}:
	 * <ul>
	 * <li>{@code tokenization=wordBoundaries}: lower-casing letter tokenizer</li>
	 * <li>{@code tokenization=whitespace}: Unicode whitespace tokenizer (no lower-casing)</li>
	 * <li>{@code language=zh*}: HMM-based Chinese tokenizer</li>
	 * <li>{@code language=bo}: ICU tokenizer configured for Tibetan, lower-cased</li>
	 * <li>otherwise: default ICU tokenizer, lower-cased</li>
	 * </ul>
	 */
	@Override
	protected TokenStreamComponents createComponents(String fieldName) {
		boolean isLexical = fieldName.equals(TokenType.lexical.name());
		String tokenization = parameters.getParameterValue("tokenization", "");
		if (isLexical && tokenization.equals("wordBoundaries")) {
			Tokenizer tokenizer = new LowerCaseTokenizer();
			return new TokenStreamComponents(tokenizer);
		} else if (isLexical && tokenization.equals("whitespace")) {
			Tokenizer tokenizer = new UnicodeWhitespaceTokenizer();
			return new TokenStreamComponents(tokenizer);
		} else if (lang.startsWith("zh") && isLexical) { // Chinese
			Tokenizer tokenizer = new HMMChineseTokenizer();
			return new TokenStreamComponents(tokenizer, tokenizer);
		} else if (lang.equals("bo") && isLexical) { // Tibetan
			Tokenizer tokenizer = new ICUTokenizer(new TromboneICUTokenizerConfig(true, true, lang));
			TokenStream stream = new LowerCaseFilter(tokenizer);
			return new TokenStreamComponents(tokenizer, stream);
		} else { // default case
			Tokenizer tokenizer = new ICUTokenizer();
			TokenStream stream = new LowerCaseFilter(tokenizer);
			return new TokenStreamComponents(tokenizer, stream);
		}
	}

	/**
	 * Parses a URL-style query string ({@code key=value&key=value}) into
	 * parameters, URL-decoding keys and values as UTF-8. Malformed pairs
	 * (empty, or without an {@code =}) are skipped rather than throwing
	 * {@link StringIndexOutOfBoundsException} as the previous implementation
	 * did when {@code indexOf("=")} returned -1.
	 *
	 * @param query the instruction payload extracted from the comment
	 * @return the decoded parameters
	 */
	private FlexibleParameters getParameters(String query) {
		FlexibleParameters parameters = new FlexibleParameters();
		String[] pairs = query.trim().split("&");
		try {
			for (String pair : pairs) {
				int idx = pair.indexOf("=");
				if (idx < 1) {continue;} // skip empty pairs and pairs with no key or no "="
				parameters.addParameter(
						URLDecoder.decode(pair.substring(0, idx), "UTF-8"),
						URLDecoder.decode(pair.substring(idx + 1), "UTF-8"));
			}
		} catch (UnsupportedEncodingException e) {
			// should never happen: UTF-8 support is mandated by the JLS
			throw new RuntimeException(e);
		}
		return parameters;
	}
}