/*
 * Copyright Robert Newson
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.rnewson.couchdb.lucene.util;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.cz.CzechAnalyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.standard.ClassicAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

import java.io.Reader;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Named factories for Lucene analyzers, plus static helpers that build an
 * analyzer from a textual or JSON specification.
 *
 * <p>Three spec shapes are understood by the static helpers:
 * <ul>
 * <li>{@code "name"} or {@code "name:args"} - resolved through
 * {@link #getAnalyzer(String)} to one of the enum constants;</li>
 * <li>{@code {"name": args}} - the first key names an enum constant;</li>
 * <li>a JSON object carrying {@code Constants.CLASS} (and optionally
 * {@code Constants.PARAMS}) - the class is instantiated reflectively.</li>
 * </ul>
 *
 * <p>Several helpers log and return {@code null} instead of throwing when an
 * analyzer cannot be built; see the individual methods.
 */
public enum Analyzers {

    BRAZILIAN {
        @Override
        public Analyzer newAnalyzer(final String args) {
            return new BrazilianAnalyzer();
        }

        @Override
        public Analyzer newAnalyzer(final JSONObject args) {
            return new BrazilianAnalyzer();
        }
    },
    CHINESE {
        @Override
        public Analyzer newAnalyzer(final String args) {
            return new SmartChineseAnalyzer();
        }

        @Override
        public Analyzer newAnalyzer(final JSONObject args) {
            return new SmartChineseAnalyzer();
        }
    },
    CJK {
        @Override
        public Analyzer newAnalyzer(final String args) {
            return new CJKAnalyzer();
        }

        @Override
        public Analyzer newAnalyzer(final JSONObject args) {
            return new CJKAnalyzer();
        }
    },
    CLASSIC {
        @Override
        public Analyzer newAnalyzer(final String args) {
            return new ClassicAnalyzer();
        }

        @Override
        public Analyzer newAnalyzer(final JSONObject args) {
            return new ClassicAnalyzer();
        }
    },
    CZECH {
        @Override
        public Analyzer newAnalyzer(final String args) {
            return new CzechAnalyzer();
        }

        @Override
        public Analyzer newAnalyzer(final JSONObject args) {
            return new CzechAnalyzer();
        }
    },
    DUTCH {
        @Override
        public Analyzer newAnalyzer(final String args) {
            return new DutchAnalyzer();
        }

        @Override
        public Analyzer newAnalyzer(final JSONObject args) {
            return new DutchAnalyzer();
        }
    },
    // ENGLISH is an alias: it yields the same StandardAnalyzer as STANDARD.
    ENGLISH {
        @Override
        public Analyzer newAnalyzer(final String args) {
            return new StandardAnalyzer();
        }

        @Override
        public Analyzer newAnalyzer(final JSONObject args) {
            return new StandardAnalyzer();
        }
    },
    FRENCH {
        @Override
        public Analyzer newAnalyzer(final String args) {
            return new FrenchAnalyzer();
        }

        @Override
        public Analyzer newAnalyzer(final JSONObject args) {
            return new FrenchAnalyzer();
        }
    },
    GERMAN {
        @Override
        public Analyzer newAnalyzer(final String args) {
            return new GermanAnalyzer();
        }

        @Override
        public Analyzer newAnalyzer(final JSONObject args) {
            return new GermanAnalyzer();
        }
    },
    KEYWORD {
        @Override
        public Analyzer newAnalyzer(final String args) {
            return new KeywordAnalyzer();
        }

        @Override
        public Analyzer newAnalyzer(final JSONObject args) {
            return new KeywordAnalyzer();
        }
    },
    /**
     * Builds a {@link PerFieldAnalyzerWrapper}: each key of the JSON argument
     * is a field name and each value an analyzer spec; the entry stored under
     * {@code Constants.DEFAULT_FIELD} becomes the wrapper's default analyzer.
     */
    PERFIELD {
        @Override
        public Analyzer newAnalyzer(final String args) throws JSONException {
            // A missing argument string is treated as an empty JSON object.
            final JSONObject json = new JSONObject(args == null ? "{}" : args);
            return PERFIELD.newAnalyzer(json);
        }

        @Override
        public Analyzer newAnalyzer(final JSONObject json) throws JSONException {
            final Analyzer defaultAnalyzer = fromSpec(json, Constants.DEFAULT_FIELD);
            final Map<String, Analyzer> analyzers = new HashMap<>();
            final Iterator<?> it = json.keys();
            while (it.hasNext()) {
                final String key = it.next().toString();
                // The default entry is passed separately, not as a per-field mapping.
                if (Constants.DEFAULT_FIELD.equals(key)) {
                    continue;
                }
                analyzers.put(key, fromSpec(json, key));
            }
            return new PerFieldAnalyzerWrapper(defaultAnalyzer, analyzers);
        }
    },
    RUSSIAN {
        @Override
        public Analyzer newAnalyzer(final String args) {
            return new RussianAnalyzer();
        }

        @Override
        public Analyzer newAnalyzer(final JSONObject args) {
            return new RussianAnalyzer();
        }
    },
    SIMPLE {
        @Override
        public Analyzer newAnalyzer(final String args) {
            return new SimpleAnalyzer();
        }

        @Override
        public Analyzer newAnalyzer(final JSONObject args) {
            return new SimpleAnalyzer();
        }
    },
    STANDARD {
        @Override
        public Analyzer newAnalyzer(final String args) {
            return new StandardAnalyzer();
        }

        @Override
        public Analyzer newAnalyzer(final JSONObject args) {
            return new StandardAnalyzer();
        }
    },
    THAI {
        @Override
        public Analyzer newAnalyzer(final String args) {
            return new ThaiAnalyzer();
        }

        @Override
        public Analyzer newAnalyzer(final JSONObject args) {
            return new ThaiAnalyzer();
        }
    },
    WHITESPACE {
        @Override
        public Analyzer newAnalyzer(final String args) {
            return new WhitespaceAnalyzer();
        }

        @Override
        public Analyzer newAnalyzer(final JSONObject args) {
            return new WhitespaceAnalyzer();
        }
    },
    /**
     * Wraps the analyzer named by {@code Constants.ANALYZER} in the JSON
     * argument with an {@link NGramTokenFilter}; optional {@code "min"} and
     * {@code "max"} keys override the filter's default gram sizes.
     */
    NGRAM {
        @Override
        public Analyzer newAnalyzer(final String args) throws JSONException {
            // A missing argument string is treated as an empty JSON object.
            final JSONObject json = new JSONObject(args == null ? "{}" : args);
            return NGRAM.newAnalyzer(json);
        }

        @Override
        public Analyzer newAnalyzer(final JSONObject json) throws JSONException {
            final Analyzer analyzer = fromSpec(json);
            final int min = json.optInt("min", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
            final int max = json.optInt("max", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
            return new NGramAnalyzer(analyzer, min, max);
        }
    };

    /**
     * Analyzer wrapper that appends an {@link NGramTokenFilter} to the token
     * stream produced by the wrapped analyzer.
     */
    private static final class NGramAnalyzer extends AnalyzerWrapper {
        private final Analyzer analyzer;
        private final int min;
        private final int max;

        public NGramAnalyzer(final Analyzer analyzer, final int min, final int max) {
            super(Analyzer.GLOBAL_REUSE_STRATEGY);
            this.analyzer = analyzer;
            this.min = min;
            this.max = max;
        }

        @Override
        protected Analyzer getWrappedAnalyzer(final String fieldName) {
            return analyzer;
        }

        @Override
        protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
            return new TokenStreamComponents(components.getTokenizer(),
                    new NGramTokenFilter(components.getTokenStream(), this.min, this.max));
        }
    }

    /**
     * Builds the analyzer described under {@code analyzerKey} in {@code json}.
     *
     * <p>If the entry is a JSON object it is handed to
     * {@link #getAnalyzer(JSONObject)}; otherwise its string form (or
     * {@code Constants.DEFAULT_ANALYZER} when the key is absent) is handed to
     * {@link #getAnalyzer(String)}.
     *
     * @param json        object holding the analyzer spec
     * @param analyzerKey key under which the spec is stored
     * @return the analyzer; may be {@code null} when {@link #getAnalyzer(JSONObject)} fails
     * @throws JSONException if the spec cannot be interpreted
     */
    public static Analyzer fromSpec(final JSONObject json, final String analyzerKey) throws JSONException {
        final JSONObject spec = json.optJSONObject(analyzerKey);
        if (spec != null) {
            return getAnalyzer(spec);
        }
        return getAnalyzer(json.optString(analyzerKey, Constants.DEFAULT_ANALYZER));
    }

    /**
     * Same as {@link #fromSpec(JSONObject, String)} using {@code Constants.ANALYZER} as the key.
     *
     * @param json object holding the analyzer spec
     * @return the analyzer; may be {@code null} when {@link #getAnalyzer(JSONObject)} fails
     * @throws JSONException if the spec cannot be interpreted
     */
    public static Analyzer fromSpec(final JSONObject json) throws JSONException {
        return fromSpec(json, Constants.ANALYZER);
    }

    /**
     * Builds an analyzer from a raw spec string.
     *
     * <p>Called from DatabaseIndexer when handling an http search request.
     *
     * <p>{@code null} selects {@code Constants.DEFAULT_ANALYZER}. A string
     * starting with <code>{</code> is parsed as JSON; if that raises a
     * {@link JSONException} the error is logged and the default analyzer is
     * used instead. Anything else is treated as {@code name[:args]}.
     *
     * @param str the spec, or {@code null}
     * @return the analyzer; may be {@code null} when {@link #getAnalyzer(JSONObject)} fails
     * @throws JSONException if a non-JSON spec cannot be interpreted
     */
    public static Analyzer fromSpec(String str) throws JSONException {
        if (str == null) {
            return getAnalyzer(Constants.DEFAULT_ANALYZER);
        }

        if (str.startsWith("{")) {
            try {
                return getAnalyzer(new JSONObject(str));
            } catch (JSONException ex) {
                logger.error("Analyzer spec is not well-formed json. Using default analyzer!", ex);
                return getAnalyzer(Constants.DEFAULT_ANALYZER);
            }
        }

        return getAnalyzer(str);
    }

    /**
     * Resolves a {@code name} or {@code name:args} spec to one of the enum
     * constants and asks it for a new analyzer.
     *
     * <p>The name is matched case-insensitively; everything after the first
     * {@code ':'} is passed through as the argument string ({@code null} when
     * there is no colon).
     *
     * @param str the spec, must not be {@code null}
     * @return the analyzer produced by the named constant
     * @throws JSONException            if the constant cannot parse its arguments
     * @throws IllegalArgumentException if no constant has the given name
     */
    public static Analyzer getAnalyzer(final String str) throws JSONException {
        final String[] parts = str.split(":", 2);
        final String args = parts.length == 2 ? parts[1] : null;
        return forName(parts[0]).newAnalyzer(args);
    }

    /**
     * Builds an analyzer from a JSON spec.
     *
     * <p>When the object has no {@code Constants.CLASS} entry, its first key
     * is taken as the name of an enum constant and the value under that key
     * (object or string) as that constant's arguments. Otherwise the named
     * class is loaded, checked to be an {@link Analyzer}, and instantiated
     * with the constructor matching the {@code Constants.PARAMS} array.
     *
     * @param json the spec
     * @return the analyzer, or {@code null} (after logging the reason) when
     *         the spec is empty, the class is missing or not an analyzer, the
     *         params are invalid, or instantiation fails
     * @throws JSONException            if a named constant cannot parse its arguments
     * @throws IllegalArgumentException if the first key names no enum constant
     */
    public static Analyzer getAnalyzer(final JSONObject json) throws JSONException {
        final String className = json.optString(Constants.CLASS);
        final JSONArray params = json.optJSONArray(Constants.PARAMS);

        if (className == null || className.isEmpty()) {
            final Iterator<?> it = json.keys();
            if (it.hasNext()) {
                final String key = (String) it.next();
                final JSONObject obj = json.optJSONObject(key);
                if (obj != null) {
                    return forName(key).newAnalyzer(obj);
                }
                return forName(key).newAnalyzer(json.optString(key));
            }
            logger.error("No analyzer class name defined in " + json);
            return null;
        }

        // is the class accessible?
        final Class<?> clazz;
        try {
            // Load without initializing, so that static initializers of a class
            // named in the spec do not run before it is known to be an Analyzer.
            clazz = Class.forName(className, false, Analyzers.class.getClassLoader());
        } catch (ClassNotFoundException e) {
            logger.error("Analyzer class " + className + " not found. " + e.getMessage(), e);
            return null;
        }

        // Is the class an Analyzer?
        if (!Analyzer.class.isAssignableFrom(clazz)) {
            logger.error(clazz.getName() + " has to be a subclass of " + Analyzer.class.getName());
            return null;
        }

        // Get list of parameters
        final List<ParamSpec> paramSpecs;
        try {
            paramSpecs = getParamSpecs(params);
        } catch (ParameterException | JSONException ex) {
            logger.error("Unable to parse parameter specs for " + className + ". " + ex.getMessage(), ex);
            return null;
        }

        // split param specs into classes and values for constructor lookup
        final Class<?>[] paramClasses = new Class<?>[paramSpecs.size()];
        final Object[] paramValues = new Object[paramSpecs.size()];
        for (int i = 0; i < paramSpecs.size(); i++) {
            final ParamSpec spec = paramSpecs.get(i);
            paramClasses[i] = spec.getValueClass();
            paramValues[i] = spec.getValue();
        }

        // Create new analyzer
        final Analyzer analyzer = newAnalyzer(clazz, paramClasses, paramValues);
        if (analyzer == null) {
            // No analyzer exists to take ownership of readers opened for
            // "file" params, so release them here.
            for (final Object value : paramValues) {
                closeIfReader(value);
            }
        }
        return analyzer;
    }

    /**
     * Looks up an enum constant by name, ignoring case.
     *
     * <p>Upper-casing uses the root locale so the lookup does not depend on
     * the JVM's default locale.
     *
     * @param name the constant's name in any case
     * @return the matching constant
     * @throws IllegalArgumentException if no constant has that name
     */
    private static Analyzers forName(final String name) {
        return Analyzers.valueOf(name.toUpperCase(java.util.Locale.ROOT));
    }

    /**
     * Closes {@code value} if it is a {@link Reader}; a failure to close is
     * logged and otherwise ignored.
     *
     * @param value a constructor parameter value of any type
     */
    private static void closeIfReader(final Object value) {
        if (!(value instanceof Reader)) {
            return;
        }
        try {
            ((Reader) value).close();
        } catch (java.io.IOException e) {
            logger.warn("Unable to close reader for analyzer parameter. " + e.getMessage(), e);
        }
    }

    /**
     * Create instance of the lucene analyzer with provided arguments
     *
     * @param clazz        The analyzer class
     * @param paramClasses The parameter classes
     * @param paramValues  The parameter values
     * @return The lucene analyzer, or {@code null} (after logging) when no
     *         matching constructor exists or instantiation fails
     */
    private static Analyzer newAnalyzer(Class<?> clazz, Class<?>[] paramClasses, Object[] paramValues) {
        final String className = clazz.getName();

        try {
            final Constructor<?> cstr = clazz.getDeclaredConstructor(paramClasses);
            return (Analyzer) cstr.newInstance(paramValues);
        } catch (IllegalArgumentException | IllegalAccessException | InstantiationException
                | InvocationTargetException | SecurityException e) {
            logger.error("Exception while instantiating analyzer class " + className + ". " + e.getMessage(), e);
        } catch (NoSuchMethodException ex) {
            logger.error("Could not find matching analyzer class constructor for " + className + " "
                    + ex.getMessage(), ex);
        }

        return null;
    }

    /**
     * Retrieve the list of parameter specs for the analyzer
     *
     * @param jsonParams array of param spec objects, or {@code null} for none
     * @return the parsed specs in array order; empty when {@code jsonParams} is {@code null}
     * @throws ParameterException if a spec is invalid
     * @throws JSONException      if an array element is not a JSON object
     */
    private static List<ParamSpec> getParamSpecs(JSONArray jsonParams) throws ParameterException, JSONException {
        final List<ParamSpec> paramSpecs = new ArrayList<>();
        if (jsonParams == null) {
            return paramSpecs;
        }

        boolean parsed = false;
        try {
            for (int i = 0; i < jsonParams.length(); i++) {
                paramSpecs.add(getParamSpec(jsonParams.getJSONObject(i)));
            }
            parsed = true;
        } finally {
            if (!parsed) {
                // A later spec failed: release readers already opened for
                // earlier "file" params, since the list is being discarded.
                for (final ParamSpec spec : paramSpecs) {
                    closeIfReader(spec.getValue());
                }
            }
        }
        return paramSpecs;
    }

    /**
     * Parse an analyzer constructor parameter spec.
     *
     * Each param spec looks like:
     *
     * <pre>{@code
     * { "name": <a name>, "type": <oneof: set, boolean, int, file, string>, "value": <value> }
     * }</pre>
     *
     * The name serves to document the purpose of the parameter. The type
     * defaults to <code>string</code>, and <code>bool</code> is accepted as a
     * synonym for <code>boolean</code>. Values of type <code>set</code> are
     * JSON arrays and are used to represent lucene CharArraySets such as for
     * stop words in StandardAnalyzer
     *
     * @param param json object specifying an analyzer parameter
     * @return ParamSpec, never {@code null}
     * @throws ParameterException if the type is unknown, a required value is
     *                            missing, or a file cannot be opened
     * @throws JSONException      if a set element cannot be read as a string
     */
    private static ParamSpec getParamSpec(JSONObject param) throws ParameterException, JSONException {
        final String name = param.optString("name");
        final String type = param.optString("type", "string");
        final String value = param.optString("value");
        // optString never yields null, so absence has to be detected on the key itself.
        final boolean valueMissing = param.isNull("value");

        switch (type) {
            // String
            case "string": {
                if (valueMissing) {
                    throw new ParameterException("Value for string param: " + name + " must not be empty!");
                }
                return new ParamSpec(name, value, String.class);
            }
            // "java.io.FileReader":
            case "file": {
                if (valueMissing) {
                    throw new ParameterException(
                            "The 'value' field of a file param must exist and must contain a file name.");
                }
                try {
                    // The analyzer is responsible for closing the file
                    // NOTE(review): FileReader decodes using the platform default charset.
                    final Reader fileReader = new java.io.FileReader(value);
                    return new ParamSpec(name, fileReader, Reader.class);
                } catch (java.io.FileNotFoundException ex) {
                    throw new ParameterException("File " + value + " for param " + name + " not found!", ex);
                }
            }
            // "org.apache.lucene.analysis.util.CharArraySet":
            case "set": {
                final JSONArray values = param.optJSONArray("value");
                if (values == null) {
                    throw new ParameterException(
                            "The 'value' field of a set param must exist and must contain a json array of strings.");
                }
                final Set<String> set = new HashSet<>();
                for (int i = 0; i < values.length(); i++) {
                    set.add(values.getString(i));
                }
                return new ParamSpec(name, CharArraySet.copy(set), CharArraySet.class);
            }
            // "int":
            case "int": {
                final int n = param.optInt("value");
                return new ParamSpec(name, n, int.class);
            }
            // "boolean":
            case "boolean":
            case "bool": {
                final boolean b = param.optBoolean("value");
                return new ParamSpec(name, b, boolean.class);
            }
            default:
                // there was no match; fail here rather than hand the caller a null spec
                throw new ParameterException(
                        "Unknown parameter type: " + type + " for param: " + name + " with value: " + value);
        }
    }

    /**
     * Class for containing the Triple : key (name), corresponding value and
     * class type of value.
     */
    private static final class ParamSpec {
        private final String key;
        private final Object value;
        private final Class<?> clazz;

        @SuppressWarnings("unused")
        public ParamSpec(String key, Object value) {
            this(key, value, value.getClass());
        }

        public ParamSpec(String key, Object value, Class<?> clazz) {
            this.key = key;
            this.value = value;
            this.clazz = clazz;
        }

        @SuppressWarnings("unused")
        public String getKey() {
            return key;
        }

        public Object getValue() {
            return value;
        }

        public Class<?> getValueClass() {
            return clazz;
        }
    }

    /**
     * Exception class for reporting problems with the parameters.
     */
    @SuppressWarnings("serial")
    private static class ParameterException extends Exception {
        public ParameterException(String message) {
            super(message);
        }

        public ParameterException(String message, Throwable cause) {
            super(message, cause);
        }
    }

    /**
     * Creates a new analyzer of this kind from a textual argument.
     *
     * @param args the text following {@code "name:"} in the spec; may be {@code null}
     * @return a new analyzer
     * @throws JSONException if the constant expects JSON arguments and cannot parse them
     */
    public abstract Analyzer newAnalyzer(final String args) throws JSONException;

    /**
     * Creates a new analyzer of this kind from a JSON argument.
     *
     * @param args the JSON object stored under this constant's name in the spec
     * @return a new analyzer
     * @throws JSONException if the arguments cannot be interpreted
     */
    public abstract Analyzer newAnalyzer(final JSONObject args) throws JSONException;

    static final Logger logger = Logger.getLogger(Analyzers.class.getName());

}