/**
 * Query parsing support for Lucene fields addressed with a {@code prefix:term} syntax.
 */
package org.voyanttools.trombone.lucene.search;

import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.simple.SimpleQueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.voyanttools.trombone.model.TokenType;

/**
 * A {@link SimpleQueryParser} that understands an explicit field prefix in front of a term
 * ({@code field:term}), and that additionally recognises range ({@code [a-m]}) and regular
 * expression terms.
 *
 * @author sgs
 */
public class FieldPrefixAwareSimpleQueryParser extends SimpleQueryParser {

	/** Separates an explicit field name from the term that follows it. */
	private static final String PREFIX_SEPARATOR = ":";

	/** A whole term of the form {@code [start-end]}; group 1 is the start, group 2 the end. */
	private static final Pattern RANGE_PATTERN = Pattern.compile("^\\[([\\p{L}0-9]+)-([\\p{L}0-9]+)\\]$");

	/** Characters whose presence makes a term be treated as a regular expression. */
	private static final Pattern REGEX_PATTERN = Pattern.compile("[\\[\\]\\?.]");

	/** A field prefix immediately followed by an opening quote, e.g. {@code title:"}. */
	private static final Pattern PHRASE_PREFIX_PATTERN = Pattern.compile("\\b(\\w+):\"");

	protected static TokenType DEFAULT_TOKENTYPE = TokenType.lexical;

	protected IndexReader reader;

	protected String defaultPrefix;

	/**
	 * Creates a parser whose default field is the name of {@link #DEFAULT_TOKENTYPE}.
	 *
	 * @param reader the index reader used when prefix queries are expanded into terms
	 * @param analyzer the analyzer handed to the underlying {@link SimpleQueryParser}
	 */
	public FieldPrefixAwareSimpleQueryParser(IndexReader reader, Analyzer analyzer) {
		this(reader, analyzer, DEFAULT_TOKENTYPE.name());
	}

	/**
	 * Creates a parser with a single default field, weighted 1.0.
	 *
	 * @param reader the index reader used when prefix queries are expanded into terms
	 * @param analyzer the analyzer handed to the underlying {@link SimpleQueryParser}
	 * @param defaultPrefix the field searched when a term carries no explicit prefix
	 */
	public FieldPrefixAwareSimpleQueryParser(IndexReader reader, Analyzer analyzer, String defaultPrefix) {
		super(analyzer, Collections.singletonMap(defaultPrefix, 1.0F));
		this.defaultPrefix = defaultPrefix;
		this.reader = reader;
	}

	/**
	 * Creates a parser that searches several weighted fields by default.
	 *
	 * <p>No single default prefix exists in this configuration, so {@link #defaultPrefix} is
	 * left unset ({@code null}).</p>
	 *
	 * @param reader the index reader used when prefix queries are expanded into terms
	 * @param analyzer the analyzer handed to the underlying {@link SimpleQueryParser}
	 * @param weights the default fields and their weights
	 */
	public FieldPrefixAwareSimpleQueryParser(IndexReader reader, Analyzer analyzer, Map<String, Float> weights) {
		super(analyzer, weights);
		// The reader must be kept: getQueriesMap(String[], boolean) passes it to rewrite().
		this.reader = reader;
	}

	/**
	 * Parses every non-blank query string.
	 *
	 * @param queries the raw query strings
	 * @return a map from each original (untrimmed) query string to its parsed query
	 */
	public Map<String, Query> getQueriesMap(String[] queries) {
		Map<String, Query> map = new HashMap<String, Query>();
		for (String query : queries) {
			if (query.trim().isEmpty()) {
				continue;
			}
			map.put(query, parse(query));
		}
		return map;
	}

	/**
	 * Parses every non-blank query string, optionally expanding compound queries into one map
	 * entry per component.
	 *
	 * <p>Expansion applies when {@code isQueryExpand} is true or when the query string starts
	 * with {@code ^} (the caret is stripped before parsing). When expanding:</p>
	 * <ul>
	 * <li>a {@link TermQuery} is stored as-is under the query string;</li>
	 * <li>a {@link PrefixQuery} is rewritten against the reader and each resulting term gets
	 * its own {@link TermQuery} entry;</li>
	 * <li>a two-clause boolean query made of a prefix query followed by a match-all query is
	 * expanded the same way, but each entry is a MUST_NOT term combined with a match-all
	 * clause, keyed with a leading {@code -};</li>
	 * <li>any other {@link BooleanQuery} contributes one entry per clause.</li>
	 * </ul>
	 *
	 * @param queries the raw query strings
	 * @param isQueryExpand whether to expand every query, not only those starting with {@code ^}
	 * @return a map from query label to query
	 * @throws IOException if rewriting a prefix query against the reader fails
	 */
	public Map<String, Query> getQueriesMap(String[] queries, boolean isQueryExpand) throws IOException {
		Map<String, Query> map = new HashMap<String, Query>();
		for (String queryString : queries) {
			if (queryString.trim().isEmpty()) {
				continue;
			}

			boolean isReallyQueryExpand = isQueryExpand;
			if (queryString.startsWith("^")) {
				isReallyQueryExpand = true;
				queryString = queryString.substring(1);
			}

			Query query = parse(queryString);
			if (!isReallyQueryExpand || query instanceof TermQuery) {
				map.put(queryString, query);
				continue;
			}

			boolean isPrefixNotQuery = query instanceof BooleanQuery
					&& ((BooleanQuery) query).clauses().size() == 2
					&& ((BooleanQuery) query).clauses().get(0).getQuery() instanceof PrefixQuery
					&& ((BooleanQuery) query).clauses().get(1).getQuery() instanceof MatchAllDocsQuery;
			if (isPrefixNotQuery) {
				// keep only the prefix part; the negation is re-applied per expanded term below
				query = ((BooleanQuery) query).clauses().get(0).getQuery();
			}

			if (query instanceof PrefixQuery) {
				// SpanMultiTermQueryWrapper's rewrite method extracts terms properly (PrefixQuery no longer does)
				SpanOrQuery spanOrQuery = (SpanOrQuery) new SpanMultiTermQueryWrapper<PrefixQuery>((PrefixQuery) query).rewrite(reader);
				for (SpanQuery sq : spanOrQuery.getClauses()) {
					Term term = ((SpanTermQuery) sq).getTerm();
					if (isPrefixNotQuery) {
						BooleanQuery.Builder builder = new BooleanQuery.Builder();
						builder.add(new TermQuery(term), Occur.MUST_NOT);
						builder.add(new MatchAllDocsQuery(), Occur.MUST);
						map.put("-" + sq.toString(defaultPrefix), builder.build());
					} else {
						map.put(sq.toString(defaultPrefix), new TermQuery(term));
					}
				}
			} else if (query instanceof BooleanQuery) {
				for (BooleanClause bc : ((BooleanQuery) query).clauses()) {
					map.put(bc.getQuery().toString(defaultPrefix), bc.getQuery());
				}
			}
			// NOTE(review): an expanded query of any other type adds nothing to the map;
			// confirm that dropping it silently is intended.
		}
		return map;
	}

	/**
	 * Parses the query text after moving any field prefix that precedes a quoted phrase inside
	 * the quotes ({@code field:"a b"} becomes {@code "field:a b"}), so that the prefix reaches
	 * {@link #newPhraseQuery(String, int)} as part of the phrase text.
	 *
	 * @param queryText the raw query text
	 * @return the parsed query
	 */
	@Override
	public Query parse(String queryText) {
		// hack to support prefixes in phrases – put the prefix within the quotes
		String modifiedQueryText = PHRASE_PREFIX_PATTERN.matcher(queryText).replaceAll("\"$1:");
		return super.parse(modifiedQueryText);
	}

	/**
	 * Builds the query for a plain term: a range query when the term matches the range syntax,
	 * a regular expression query when it contains regex characters, otherwise a query on the
	 * explicit field if one is given or the superclass default.
	 *
	 * @param text the term, optionally preceded by {@code field:}
	 * @return the query for the term
	 * @throws IllegalArgumentException if building a range or regex query fails with an I/O error
	 */
	@Override
	protected Query newDefaultQuery(String text) {
		int pos = text.indexOf(PREFIX_SEPARATOR);
		String term = pos == -1 ? text : text.substring(pos + 1);
		try {
			if (RANGE_PATTERN.matcher(term).find()) {
				return newRangeQuery(text);
			}
			if (REGEX_PATTERN.matcher(term).find()) {
				return newRegexQuery(text);
			}
		} catch (IOException e) {
			throw new IllegalArgumentException("Unable to create a query from "+text, e);
		}
		if (pos == -1) {
			return super.newDefaultQuery(text);
		}
		return createBooleanQuery(text.substring(0, pos), term, Occur.SHOULD);
	}

	/**
	 * Builds a fuzzy query, on the explicit field when the text carries a prefix.
	 *
	 * @param text the term, optionally preceded by {@code field:}
	 * @param fuzziness the maximum number of edits
	 * @return the fuzzy query
	 */
	@Override
	protected Query newFuzzyQuery(String text, int fuzziness) {
		int pos = text.indexOf(PREFIX_SEPARATOR);
		if (pos == -1) {
			return super.newFuzzyQuery(text, fuzziness);
		}
		return new FuzzyQuery(new Term(text.substring(0, pos), text.substring(pos + 1)), fuzziness);
	}

	/**
	 * Builds a phrase query, on the explicit field when the text carries a prefix.
	 *
	 * @param text the phrase, optionally preceded by {@code field:}
	 * @param slop the phrase slop
	 * @return the phrase query
	 */
	@Override
	protected Query newPhraseQuery(String text, int slop) {
		int pos = text.indexOf(PREFIX_SEPARATOR);
		if (pos == -1) {
			return super.newPhraseQuery(text, slop);
		}
		return createPhraseQuery(text.substring(0, pos), text.substring(pos + 1), slop);
	}

	/**
	 * Builds a prefix query, on the explicit field when the text carries a prefix. Text that
	 * contains regex characters (and does not end with the prefix separator) is instead turned
	 * into a regular expression query with {@code .*} appended.
	 *
	 * @param text the prefix text, optionally preceded by {@code field:}
	 * @return the prefix or regular expression query
	 * @throws IllegalArgumentException if building the regex query fails with an I/O error
	 */
	@Override
	protected Query newPrefixQuery(String text) {
		// we got here but actually need a regex
		if (REGEX_PATTERN.matcher(text).find() && !text.endsWith(":")) {
			try {
				return newRegexQuery(text+".*");
			} catch (IOException e) {
				// keep the cause so the underlying I/O failure is not lost
				throw new IllegalArgumentException("Unable to expand query: "+text+".*", e);
			}
		}
		int pos = text.indexOf(PREFIX_SEPARATOR);
		if (pos == -1) {
			return super.newPrefixQuery(text);
		}
		return new PrefixQuery(new Term(text.substring(0, pos), text.substring(pos + 1)));
	}

	/**
	 * Builds a regular expression query on the explicit field, or on every default field when
	 * no prefix is present.
	 *
	 * @param text the regular expression, optionally preceded by {@code field:}
	 * @return the simplified query
	 * @throws IOException declared for callers; not thrown by the visible code path
	 */
	protected Query newRegexQuery(String text) throws IOException {
		int pos = text.indexOf(PREFIX_SEPARATOR);
		BooleanQuery.Builder builder = new BooleanQuery.Builder();
		if (pos == -1) {
			// one clause per default field; the configured field weights are not applied here
			for (Map.Entry<String,Float> entry : weights.entrySet()) {
				builder.add(new RegexpQuery(new Term(entry.getKey(), text)), BooleanClause.Occur.SHOULD);
			}
		} else {
			String prefix = text.substring(0, pos);
			String term = text.substring(pos + 1);
			builder.add(new RegexpQuery(new Term(prefix, term)), Occur.SHOULD);
		}
		return simplify(builder.build());
	}

	/**
	 * Builds a term range query on the explicit field, or on every default field when no prefix
	 * is present. When the term does not match the range syntax no clause is added.
	 *
	 * @param text the range expression ({@code [start-end]}), optionally preceded by {@code field:}
	 * @return the simplified query
	 * @throws IOException declared for callers; not thrown by the visible code path
	 */
	protected Query newRangeQuery(String text) throws IOException {
		int pos = text.indexOf(PREFIX_SEPARATOR);
		String term = pos == -1 ? text : text.substring(pos + 1);
		Matcher matcher = RANGE_PATTERN.matcher(term);
		BooleanQuery.Builder builder = new BooleanQuery.Builder();
		if (matcher.find()) {
			if (pos == -1) {
				// one clause per default field; the configured field weights are not applied here
				for (Map.Entry<String,Float> entry : weights.entrySet()) {
					builder.add(newRangeQuery(entry.getKey(), matcher), BooleanClause.Occur.SHOULD);
				}
			} else {
				builder.add(newRangeQuery(text.substring(0, pos), matcher), Occur.SHOULD);
			}
		}
		return simplify(builder.build());
	}

	/**
	 * Builds an inclusive string range query from an already-matched range expression.
	 *
	 * @param field the field to search
	 * @param matcher a matcher on which a match of the range pattern has already succeeded
	 * @return a range query from group 1 to group 2, both bounds inclusive
	 */
	protected Query newRangeQuery(String field, Matcher matcher) {
		String start = matcher.group(1);
		String end = matcher.group(2);
		return TermRangeQuery.newStringRange(field, start, end, true, true);
	}
}