/* * Copyright 2015, Stratio. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.stratio.cassandra.index.schema.analysis; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.ca.CatalanAnalyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.da.DanishAnalyzer; import org.apache.lucene.analysis.de.GermanAnalyzer; import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.es.SpanishAnalyzer; import org.apache.lucene.analysis.eu.BasqueAnalyzer; import org.apache.lucene.analysis.fi.FinnishAnalyzer; import org.apache.lucene.analysis.fr.FrenchAnalyzer; import org.apache.lucene.analysis.ga.IrishAnalyzer; import org.apache.lucene.analysis.hu.HungarianAnalyzer; import org.apache.lucene.analysis.it.ItalianAnalyzer; import org.apache.lucene.analysis.nl.DutchAnalyzer; import org.apache.lucene.analysis.no.NorwegianAnalyzer; import org.apache.lucene.analysis.pt.PortugueseAnalyzer; import org.apache.lucene.analysis.ro.RomanianAnalyzer; import org.apache.lucene.analysis.ru.RussianAnalyzer; import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.sv.SwedishAnalyzer; import org.apache.lucene.analysis.util.CharArraySet; import org.codehaus.jackson.annotate.JsonCreator; import org.codehaus.jackson.annotate.JsonProperty; import java.io.Reader; import java.util.ArrayList; import java.util.List; /** * {@link AnalyzerBuilder} for tartarus.org snowball {@link Analyzer}. * <p/> * The supported languages are English, French, Spanish, Portuguese, Italian, Romanian, German, Dutch, Swedish, * Norwegian, Danish, Russian, Finnish, Irish, Hungarian, Turkish, Armenian, Basque and Catalan. * * @author Andres de la Pena <adelapena@stratio.com> */ public class SnowballAnalyzerBuilder extends AnalyzerBuilder { private final Analyzer analyzer; /** * Builds a new {@link SnowballAnalyzerBuilder} for the specified language and stopwords. * * @param language The language. The supported languages are English, French, Spanish, Portuguese, Italian, * Romanian, German, Dutch, Swedish, Norwegian, Danish, Russian, Finnish, Irish, Hungarian, * Turkish, Armenian, Basque and Catalan. * @param stopwords The comma separated stopwords {@code String}. */ @JsonCreator public SnowballAnalyzerBuilder(@JsonProperty("language") final String language, @JsonProperty("stopwords") String stopwords) { // Check language if (language == null || language.trim().isEmpty()) { throw new IllegalArgumentException("Language must be specified"); } // Setup stopwords CharArraySet stops = stopwords == null ? getDefaultStopwords(language) : getStopwords(stopwords); // Setup analyzer this.analyzer = buildAnalyzer(language, stops); // Force analysis validation AnalysisUtils.analyzeAsText("test", analyzer); } /** {@inheritDoc} */ @Override public Analyzer analyzer() { return analyzer; } /** * Returns the snowball {@link Analyzer} for the specified language and stopwords. * * @param language The language code. The supported languages are English, French, Spanish, Portuguese, Italian, * Romanian, German, Dutch, Swedish, Norwegian, Danish, Russian, Finnish, Irish, Hungarian, * Turkish, Armenian, Basque and Catalan. * @param stopwords The stop words. * @return The snowball {@link Analyzer} for the specified language and stopwords. */ private static Analyzer buildAnalyzer(final String language, final CharArraySet stopwords) { return new Analyzer() { protected TokenStreamComponents createComponents(String field, Reader reader) { final Tokenizer source = new StandardTokenizer(reader); TokenStream result = new StandardFilter(source); result = new LowerCaseFilter(result); result = new StopFilter(result, stopwords); result = new SnowballFilter(result, language); return new TokenStreamComponents(source, result); } }; } /** * Returns the stopwords {@link CharArraySet} for the specified comma separated stopwords {@code String}. * * @param stopwords A {@code String} comma separated stopwords list. * @return The stopwords {@link CharArraySet} for the specified comma separated stopwords {@code String}. */ private static CharArraySet getStopwords(String stopwords) { List<String> stopwordsList = new ArrayList<>(); for (String stop : stopwords.split(",")) { stopwordsList.add(stop.trim()); } return new CharArraySet(stopwordsList, true); } /** * Returns the default stopwords set used by Lucene language analyzer for the specified language. * * @param language The language for which the stopwords are. The supported languages are English, French, Spanish, * Portuguese, Italian, Romanian, German, Dutch, Swedish, Norwegian, Danish, Russian, Finnish, * Irish, Hungarian, Turkish, Armenian, Basque and Catalan. * @return The default stopwords set used by Lucene language analyzers. */ private static CharArraySet getDefaultStopwords(String language) { switch (language) { case "English": return EnglishAnalyzer.getDefaultStopSet(); case "French": return FrenchAnalyzer.getDefaultStopSet(); case "Spanish": return SpanishAnalyzer.getDefaultStopSet(); case "Portuguese": return PortugueseAnalyzer.getDefaultStopSet(); case "Italian": return ItalianAnalyzer.getDefaultStopSet(); case "Romanian": return RomanianAnalyzer.getDefaultStopSet(); case "German": return GermanAnalyzer.getDefaultStopSet(); case "Dutch": return DutchAnalyzer.getDefaultStopSet(); case "Swedish": return SwedishAnalyzer.getDefaultStopSet(); case "Norwegian": return NorwegianAnalyzer.getDefaultStopSet(); case "Danish": return DanishAnalyzer.getDefaultStopSet(); case "Russian": return RussianAnalyzer.getDefaultStopSet(); case "Finnish": return FinnishAnalyzer.getDefaultStopSet(); case "Irish": return IrishAnalyzer.getDefaultStopSet(); case "Hungarian": return HungarianAnalyzer.getDefaultStopSet(); case "Turkish": return SpanishAnalyzer.getDefaultStopSet(); case "Armenian": return SpanishAnalyzer.getDefaultStopSet(); case "Basque": return BasqueAnalyzer.getDefaultStopSet(); case "Catalan": return CatalanAnalyzer.getDefaultStopSet(); default: return CharArraySet.EMPTY_SET; } } }