/**
 *
 */
package org.voyanttools.trombone.lucene.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;

/**
 * Variant of {@link FieldPrefixAwareSimpleQueryParser} whose factory methods
 * build {@link SpanQuery} instances (span term, span-or, span-near, span-not
 * and span multi-term wrappers) instead of the plain Lucene query types.
 *
 * @author sgs
 */
public class FieldPrefixAwareSimpleSpanQueryParser extends FieldPrefixAwareSimpleQueryParser {

	/**
	 * Creates a parser; both arguments are handed to the superclass constructor.
	 *
	 * @param indexReader the index reader passed to the superclass
	 * @param analyzer the analyzer passed to the superclass
	 */
	public FieldPrefixAwareSimpleSpanQueryParser(IndexReader indexReader, Analyzer analyzer) {
		super(indexReader, analyzer);
	}

	/**
	 * Creates a parser; all arguments are handed to the superclass constructor.
	 *
	 * @param indexReader the index reader passed to the superclass
	 * @param analyzer the analyzer passed to the superclass
	 * @param defaultPrefix the default prefix passed to the superclass
	 */
	public FieldPrefixAwareSimpleSpanQueryParser(IndexReader indexReader, Analyzer analyzer, String defaultPrefix) {
		super(indexReader, analyzer, defaultPrefix);
	}

	/**
	 * Parses the query text with the superclass and makes sure the result is a
	 * {@link SpanQuery}.
	 *
	 * <p>A {@link SpanNearQuery} or {@link SpanOrQuery} holding exactly one
	 * clause is unwrapped to that clause. A {@link BooleanQuery} is converted
	 * into span queries.</p>
	 *
	 * @param queryText the text to parse
	 * @return the resulting span query
	 * @throws IllegalStateException if the superclass produced a query that is
	 *         neither a span query nor a boolean query
	 * @throws IllegalArgumentException if a boolean query contains a clause of
	 *         a type that cannot be converted
	 */
	@Override
	public Query parse(String queryText) {
		Query query = super.parse(queryText);
		if (query instanceof SpanQuery) {
			if (query instanceof SpanNearQuery) {
				SpanQuery[] nearClauses = ((SpanNearQuery) query).getClauses();
				if (nearClauses.length == 1) {
					return nearClauses[0];
				}
			}
			if (query instanceof SpanOrQuery) {
				SpanQuery[] orClauses = ((SpanOrQuery) query).getClauses();
				if (orClauses.length == 1) {
					return orClauses[0];
				}
			}
			return query;
		}
		if (query instanceof BooleanQuery) {
			return convertBooleanQuerytoSpanQuery((BooleanQuery) query, queryText);
		}
		throw new IllegalStateException("Cannot convert to SpanQuery: "+query);
	}

	/**
	 * Converts a boolean query (recursively) into a span query.
	 *
	 * <p>Span clauses are collected into a {@link SpanOrQuery}; span-or and
	 * span-term clauses marked {@code MUST_NOT} become the exclusion of a
	 * {@link SpanNotQuery}. {@link MatchAllDocsQuery} clauses are skipped.</p>
	 *
	 * @param query the boolean query to convert
	 * @param queryText the original query text, used only in error messages
	 * @return the equivalent span query
	 * @throws IllegalArgumentException if a clause has an unanticipated type
	 */
	private SpanQuery convertBooleanQuerytoSpanQuery(BooleanQuery query, String queryText) {
		List<SpanQuery> spanQueries = new ArrayList<SpanQuery>();
		List<SpanQuery> notQueries = new ArrayList<SpanQuery>();
		for (BooleanClause clause : query.clauses()) {
			Query q = clause.getQuery();
			if (q instanceof SpanQuery) {
				boolean canBeExcluded = q instanceof SpanOrQuery || q instanceof SpanTermQuery;
				if (canBeExcluded && clause.getOccur() == BooleanClause.Occur.MUST_NOT) {
					notQueries.add((SpanQuery) q);
				} else {
					spanQueries.add((SpanQuery) q);
				}
			} else if (q instanceof MatchAllDocsQuery) {
				// a match-all clause contributes no span clause, so it is skipped
				continue;
			} else if (q instanceof BooleanQuery) {
				// convert the nested boolean query exactly once
				spanQueries.add(convertBooleanQuerytoSpanQuery((BooleanQuery) q, queryText));
			} else {
				throw new IllegalArgumentException("Unable to parse query: "+queryText+", unanticipated query type: "+q.getClass().getName());
			}
		}

		SpanOrQuery combined = toSpanOrQuery(spanQueries);
		if (!notQueries.isEmpty()) {
			// FIXME: this only uses one not
			return new SpanNotQuery(combined, notQueries.get(0));
		}

		// nested conversions may themselves have produced span-not queries;
		// separate them from the ordinary clauses
		List<SpanQuery> ors = new ArrayList<SpanQuery>();
		List<SpanNotQuery> nots = new ArrayList<SpanNotQuery>();
		for (SpanQuery q : combined.getClauses()) {
			if (q instanceof SpanNotQuery) {
				nots.add((SpanNotQuery) q);
			} else {
				ors.add(q);
			}
		}
		if (nots.isEmpty()) {
			return combined;
		}
		SpanQuery include = ors.size() == 1 ? ors.get(0) : toSpanOrQuery(ors);
		// FIXME: this only uses one not
		return new SpanNotQuery(include, nots.get(0).getExclude());
	}

	/**
	 * Parses each query string and collects the resulting span queries in a
	 * map keyed by a query label.
	 *
	 * <p>Blank query strings are skipped. A leading {@code ^} forces expansion
	 * for that query and is stripped before parsing. When expanding a
	 * {@link SpanOrQuery}, each clause that matches at least one document is
	 * added under its own string form; if no clause matches, the whole query
	 * is added under the query text. Without expansion, a span-or query is
	 * turned into a {@link SpanNearQuery} when the text looks like an AND
	 * query ({@code +this +that}) or like an unquoted phrase.</p>
	 *
	 * @param queries the query strings to parse
	 * @param isQueryExpand whether span-or queries should be expanded into one
	 *        entry per matching clause
	 * @return map from query label to span query
	 * @throws IOException if searching the index fails
	 */
	public Map<String, SpanQuery> getSpanQueriesMap(String[] queries, boolean isQueryExpand) throws IOException {
		Map<String, SpanQuery> map = new HashMap<String, SpanQuery>();
		for (String query : queries) {
			if (query.trim().isEmpty()) {
				continue;
			}
			boolean isReallyQueryExpand = isQueryExpand;
			if (query.startsWith("^")) {
				isReallyQueryExpand = true;
				query = query.substring(1);
			}
			Query q = parse(query);
			if (isReallyQueryExpand && !(q instanceof SpanTermQuery)) {
				if (q instanceof SpanOrQuery) {
					IndexSearcher searcher = new IndexSearcher(reader);
					int count = 0;
					for (SpanQuery spanQuery : ((SpanOrQuery) q).getClauses()) {
						// we need to double-check that this term is in the corpus (the query rewrite method includes all terms);
						// totalHits is the total number of matching documents, so test for "at least one"
						if (searcher.search(spanQuery, 1).totalHits > 0) {
							map.put(spanQuery.toString(defaultPrefix), spanQuery);
							count++;
						}
					}
					if (count == 0) {
						map.put(query, (SpanOrQuery) q);
					}
				}
				// NOTE(review): an expanded query that is neither a span term nor a
				// span-or query is not added to the map at all -- confirm this is intended
			} else {
				if (q instanceof SpanOrQuery) {
					SpanQuery[] clauses = ((SpanOrQuery) q).getClauses();
					if (clauses.length > 0) {
						// check if it looks like an and query: +this +that
						if (StringUtils.countMatches(query, "+") == clauses.length) {
							// create an AND query by having a huge slop TODO: is this an important performance hit?
							q = new SpanNearQuery(clauses, Integer.MAX_VALUE, false);
						}
						// check to see if we have a bare phrase (no quotes, no or operator but still SpanOr)
						else if (query.indexOf(" ") > -1 && query.indexOf("|") == -1 && query.indexOf("\"") == -1) {
							q = new SpanNearQuery(clauses, 0, true);
							query = "\""+query+"\"";
						}
					}
				}
				map.put(query, (SpanQuery) q);
			}
		}
		return map;
	}

	/**
	 * Builds the default query as a span query: a boolean result from the
	 * superclass becomes a {@link SpanOrQuery} over its clauses.
	 *
	 * @param text the term text
	 * @return a span query for the text
	 * @throws IllegalStateException if a query is neither a term query nor a span query
	 */
	@Override
	protected Query newDefaultQuery(String text) {
		Query query = super.newDefaultQuery(text);
		if (query instanceof BooleanQuery) {
			List<SpanQuery> spanQueries = new ArrayList<SpanQuery>();
			for (BooleanClause clause : ((BooleanQuery) query).clauses()) {
				spanQueries.add(getSpanTermQuery(clause.getQuery()));
			}
			return toSpanOrQuery(spanQueries);
		}
		return getSpanTermQuery(query);
	}

	/**
	 * Returns the query as a span query, converting a {@link TermQuery} into a
	 * {@link SpanTermQuery} on the same term.
	 *
	 * @param query a term query or span query
	 * @return the span query
	 * @throws IllegalStateException for any other query type
	 */
	private SpanQuery getSpanTermQuery(Query query) {
		if (query instanceof TermQuery) {
			return new SpanTermQuery(((TermQuery) query).getTerm());
		}
		if (query instanceof SpanQuery) {
			return (SpanQuery) query;
		}
		throw new IllegalStateException("Unexpected query type: "+query.getClass().getCanonicalName());
	}

	/**
	 * Builds a fuzzy query wrapped as a span query; a boolean result from the
	 * superclass becomes a {@link SpanOrQuery} of wrapped clauses.
	 *
	 * @param text the term text
	 * @param fuzziness the fuzziness passed to the superclass
	 * @return a span query for the fuzzy match
	 */
	@Override
	protected Query newFuzzyQuery(String text, int fuzziness) {
		Query query = super.newFuzzyQuery(text, fuzziness);
		if (query instanceof BooleanQuery) {
			List<SpanQuery> spanQueries = new ArrayList<SpanQuery>();
			for (BooleanClause clause : ((BooleanQuery) query).clauses()) {
				spanQueries.add(new SpanMultiTermQueryWrapper<FuzzyQuery>((FuzzyQuery) clause.getQuery()));
			}
			return toSpanOrQuery(spanQueries);
		}
		return new SpanMultiTermQueryWrapper<FuzzyQuery>((FuzzyQuery) query);
	}

	/**
	 * Builds a phrase query as a {@link SpanNearQuery}; a boolean result from
	 * the superclass becomes a {@link SpanOrQuery} of span-near queries.
	 *
	 * @param text the phrase text
	 * @param slop the slop passed to the superclass
	 * @return a span query for the phrase
	 */
	@Override
	protected Query newPhraseQuery(String text, int slop) {
		Query query = super.newPhraseQuery(text, slop);
		if (query instanceof BooleanQuery) {
			List<SpanQuery> spanQueries = new ArrayList<SpanQuery>();
			for (BooleanClause clause : ((BooleanQuery) query).clauses()) {
				spanQueries.add(getSpanNearQuery((PhraseQuery) clause.getQuery()));
			}
			return toSpanOrQuery(spanQueries);
		}
		return getSpanNearQuery((PhraseQuery) query);
	}

	/**
	 * Converts a phrase query into a {@link SpanNearQuery} over span term
	 * queries for the phrase's terms, keeping the phrase's slop; the span-near
	 * query is created with {@code inOrder} set to {@code false}.
	 *
	 * @param query the phrase query to convert
	 * @return the span-near query
	 */
	private SpanQuery getSpanNearQuery(PhraseQuery query) {
		Term[] terms = query.getTerms();
		SpanQuery[] termSpans = new SpanQuery[terms.length];
		for (int i = 0; i < terms.length; i++) {
			termSpans[i] = new SpanTermQuery(terms[i]);
		}
		return new SpanNearQuery(termSpans, query.getSlop(), false);
	}

	/**
	 * Builds a prefix query as a span query rewritten against the index
	 * reader; a boolean result from the superclass becomes a
	 * {@link SpanOrQuery}, and a span-or result is returned unchanged.
	 *
	 * @param text the prefix text
	 * @return a span query for the prefix
	 * @throws IllegalStateException if rewriting against the index fails
	 */
	@Override
	protected Query newPrefixQuery(String text) {
		Query query = super.newPrefixQuery(text);
		if (query instanceof BooleanQuery) {
			List<SpanQuery> spanQueries = new ArrayList<SpanQuery>();
			for (BooleanClause clause : ((BooleanQuery) query).clauses()) {
				spanQueries.add(getQuery((PrefixQuery) clause.getQuery()));
			}
			return toSpanOrQuery(spanQueries);
		}
		if (query instanceof SpanOrQuery) {
			return query;
		}
		return getQuery((PrefixQuery) query);
	}

	/**
	 * Wraps the prefix query as a span multi-term query and rewrites it
	 * against the index reader.
	 *
	 * @param query the prefix query to wrap
	 * @return the rewritten span query
	 * @throws IllegalStateException if the rewrite raises an {@link IOException};
	 *         the I/O failure is preserved as the cause
	 */
	private SpanQuery getQuery(PrefixQuery query) {
		try {
			return (SpanQuery) new SpanMultiTermQueryWrapper<PrefixQuery>(query).rewrite(reader);
		} catch (IOException e) {
			throw new IllegalStateException("Unable to expand queries from Lucene index for query: "+query.toString(), e);
		}
	}

	/**
	 * Builds a regular-expression query as a span query. A boolean result from
	 * the superclass becomes a {@link SpanOrQuery} of wrapped clauses;
	 * otherwise the wrapped query is rewritten against the index reader.
	 *
	 * @param text the regular expression text
	 * @return a span query for the regular expression
	 * @throws IOException if the superclass or the rewrite fails
	 */
	protected Query newRegexQuery(String text) throws IOException {
		Query query = super.newRegexQuery(text);
		if (query instanceof BooleanQuery) {
			List<SpanQuery> spanQueries = new ArrayList<SpanQuery>();
			for (BooleanClause clause : ((BooleanQuery) query).clauses()) {
				spanQueries.add(new SpanMultiTermQueryWrapper<RegexpQuery>((RegexpQuery) clause.getQuery()));
			}
			return toSpanOrQuery(spanQueries);
		}
		Query spanRegexQuery = new SpanMultiTermQueryWrapper<RegexpQuery>((RegexpQuery) query);
		return spanRegexQuery.rewrite(reader);
	}

	/**
	 * Builds a term-range query as a span query rewritten against the index
	 * reader; a boolean result from the superclass becomes a
	 * {@link SpanOrQuery} of rewritten clauses.
	 *
	 * @param text the range text
	 * @return a span query for the range
	 * @throws IOException if the superclass or the rewrite fails
	 */
	protected Query newRangeQuery(String text) throws IOException {
		Query query = super.newRangeQuery(text);
		if (query instanceof BooleanQuery) {
			List<SpanQuery> spanQueries = new ArrayList<SpanQuery>();
			for (BooleanClause clause : ((BooleanQuery) query).clauses()) {
				SpanQuery wrapped = new SpanMultiTermQueryWrapper<TermRangeQuery>((TermRangeQuery) clause.getQuery());
				spanQueries.add((SpanQuery) wrapped.rewrite(reader));
			}
			return toSpanOrQuery(spanQueries);
		}
		Query rangeQuery = new SpanMultiTermQueryWrapper<TermRangeQuery>((TermRangeQuery) query);
		return rangeQuery.rewrite(reader);
	}

	/** Combines the given span queries into a single {@link SpanOrQuery}. */
	private static SpanOrQuery toSpanOrQuery(List<? extends SpanQuery> spanQueries) {
		return new SpanOrQuery(spanQueries.toArray(new SpanQuery[spanQueries.size()]));
	}
}