// SnowballAnalyzerBuilder.java example
//
// Explorer
// stratio-cassandra-master
/*
 * Copyright 2015, Stratio.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.stratio.cassandra.index.schema.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ca.CatalanAnalyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.da.DanishAnalyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.apache.lucene.analysis.eu.BasqueAnalyzer;
import org.apache.lucene.analysis.fi.FinnishAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.ga.IrishAnalyzer;
import org.apache.lucene.analysis.hu.HungarianAnalyzer;
import org.apache.lucene.analysis.hy.ArmenianAnalyzer;
import org.apache.lucene.analysis.it.ItalianAnalyzer;
import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.no.NorwegianAnalyzer;
import org.apache.lucene.analysis.pt.PortugueseAnalyzer;
import org.apache.lucene.analysis.ro.RomanianAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.sv.SwedishAnalyzer;
import org.apache.lucene.analysis.tr.TurkishAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.codehaus.jackson.annotate.JsonCreator;
import org.codehaus.jackson.annotate.JsonProperty;

import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

/**
 * {@link AnalyzerBuilder} for tartarus.org snowball {@link Analyzer}.
 * <p>
 * The supported languages are English, French, Spanish, Portuguese, Italian, Romanian, German, Dutch, Swedish,
 * Norwegian, Danish, Russian, Finnish, Irish, Hungarian, Turkish, Armenian, Basque and Catalan.
 * <p>
 * The analyzer is built once in the constructor and the same instance is returned by every call to
 * {@link #analyzer()}.
 *
 * @author Andres de la Pena <adelapena@stratio.com>
 */
public class SnowballAnalyzerBuilder extends AnalyzerBuilder {

    /** The analyzer built at construction time. */
    private final Analyzer analyzer;

    /**
     * Builds a new {@link SnowballAnalyzerBuilder} for the specified language and stopwords.
     *
     * @param language  The language. The supported languages are English, French, Spanish, Portuguese, Italian,
     *                  Romanian, German, Dutch, Swedish, Norwegian, Danish, Russian, Finnish, Irish, Hungarian,
     *                  Turkish, Armenian, Basque and Catalan. The name is matched exactly (capitalized, as listed)
     *                  when looking up the default stopwords.
     * @param stopwords The comma separated stopwords {@code String}. If {@code null}, the default stopwords of the
     *                  Lucene analyzer for the language are used.
     * @throws IllegalArgumentException if {@code language} is {@code null} or blank.
     */
    @JsonCreator
    public SnowballAnalyzerBuilder(@JsonProperty("language") final String language,
                                   @JsonProperty("stopwords") String stopwords) {

        // Check language
        if (language == null || language.trim().isEmpty()) {
            throw new IllegalArgumentException("Language must be specified");
        }

        // Setup stopwords: explicit list if given, otherwise the language defaults
        CharArraySet stops = stopwords == null ? getDefaultStopwords(language) : getStopwords(stopwords);

        // Setup analyzer
        this.analyzer = buildAnalyzer(language, stops);

        // Force analysis validation: running the analyzer once here is presumably meant to surface an
        // unsupported language at construction time rather than at first use (see AnalysisUtils).
        AnalysisUtils.analyzeAsText("test", analyzer);
    }

    /** {@inheritDoc} */
    @Override
    public Analyzer analyzer() {
        return analyzer;
    }

    /**
     * Returns the snowball {@link Analyzer} for the specified language and stopwords.
     * <p>
     * The analysis chain is: standard tokenizer, standard filter, lower-casing, stopword removal and finally
     * snowball stemming for the given language.
     *
     * @param language  The language name, passed as-is to {@link SnowballFilter}. The supported languages are
     *                  English, French, Spanish, Portuguese, Italian, Romanian, German, Dutch, Swedish, Norwegian,
     *                  Danish, Russian, Finnish, Irish, Hungarian, Turkish, Armenian, Basque and Catalan.
     * @param stopwords The stop words.
     * @return The snowball {@link Analyzer} for the specified language and stopwords.
     */
    private static Analyzer buildAnalyzer(final String language, final CharArraySet stopwords) {
        return new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String field, Reader reader) {
                final Tokenizer source = new StandardTokenizer(reader);
                TokenStream result = new StandardFilter(source);
                result = new LowerCaseFilter(result);
                // Stopwords are removed before stemming, so they are matched against unstemmed tokens
                result = new StopFilter(result, stopwords);
                result = new SnowballFilter(result, language);
                return new TokenStreamComponents(source, result);
            }
        };
    }

    /**
     * Returns the stopwords {@link CharArraySet} for the specified comma separated stopwords {@code String}.
     * <p>
     * Each entry is trimmed; blank entries (e.g. produced by {@code "a,,b"} or a trailing comma) are ignored.
     *
     * @param stopwords A {@code String} comma separated stopwords list.
     * @return The stopwords {@link CharArraySet} for the specified comma separated stopwords {@code String}.
     */
    private static CharArraySet getStopwords(String stopwords) {
        List<String> stopwordsList = new ArrayList<>();
        for (String stop : stopwords.split(",")) {
            String trimmed = stop.trim();
            // An empty string can never match a token, so don't pollute the set with it
            if (!trimmed.isEmpty()) {
                stopwordsList.add(trimmed);
            }
        }
        // Second argument requests case-insensitive matching
        return new CharArraySet(stopwordsList, true);
    }

    /**
     * Returns the default stopwords set used by Lucene language analyzer for the specified language.
     *
     * @param language The language for which the stopwords are. The supported languages are English, French, Spanish,
     *                 Portuguese, Italian, Romanian, German, Dutch, Swedish, Norwegian, Danish, Russian, Finnish,
     *                 Irish, Hungarian, Turkish, Armenian, Basque and Catalan.
     * @return The default stopwords set used by Lucene language analyzers, or an empty set if the language is not
     * one of the listed names.
     */
    private static CharArraySet getDefaultStopwords(String language) {
        switch (language) {
            case "English":
                return EnglishAnalyzer.getDefaultStopSet();
            case "French":
                return FrenchAnalyzer.getDefaultStopSet();
            case "Spanish":
                return SpanishAnalyzer.getDefaultStopSet();
            case "Portuguese":
                return PortugueseAnalyzer.getDefaultStopSet();
            case "Italian":
                return ItalianAnalyzer.getDefaultStopSet();
            case "Romanian":
                return RomanianAnalyzer.getDefaultStopSet();
            case "German":
                return GermanAnalyzer.getDefaultStopSet();
            case "Dutch":
                return DutchAnalyzer.getDefaultStopSet();
            case "Swedish":
                return SwedishAnalyzer.getDefaultStopSet();
            case "Norwegian":
                return NorwegianAnalyzer.getDefaultStopSet();
            case "Danish":
                return DanishAnalyzer.getDefaultStopSet();
            case "Russian":
                return RussianAnalyzer.getDefaultStopSet();
            case "Finnish":
                return FinnishAnalyzer.getDefaultStopSet();
            case "Irish":
                return IrishAnalyzer.getDefaultStopSet();
            case "Hungarian":
                return HungarianAnalyzer.getDefaultStopSet();
            case "Turkish":
                return TurkishAnalyzer.getDefaultStopSet();
            case "Armenian":
                return ArmenianAnalyzer.getDefaultStopSet();
            case "Basque":
                return BasqueAnalyzer.getDefaultStopSet();
            case "Catalan":
                return CatalanAnalyzer.getDefaultStopSet();
            default:
                // Unknown language name: no default stopwords
                return CharArraySet.EMPTY_SET;
        }
    }
}