package org.voyanttools.trombone.lucene.search;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Weight;
import org.voyanttools.trombone.model.TokenType;
/**
 * Base class for Trombone's Lucene query parsers: it turns Voyant-style query
 * strings (semi-colon separated queries, comma- or pipe-separated terms, phrases
 * with an optional trailing ~slop, wildcards and [regex] fragments, and optional
 * "field:" prefixes) into Lucene {@link Query} objects, delegating construction
 * of the concrete query types to subclasses.
 *
 * @author sgs
 */
public abstract class AbstractQueryParser {
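// building blocks of the query syntax: query and term separators, wildcard characters,
// the optional trailing slop suffix (~N) for phrases, embedded [regex] fragments,
// field prefixes (field:term) and the leading + operator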
protected final static Pattern QUERY_SEPARATOR = Pattern.compile(";");
protected final static Pattern TERM_SEPARATOR = Pattern.compile("[,\\|]");
protected final static String QUOTE = "\"";
protected final static String EMPTY = "";
protected final static String WILDCARD_ASTERISK = "*";
protected final static String WILDCARD_QUESTION = "?";
protected final static Pattern REGEX_PATTERN = Pattern.compile("(\\[.+?\\])");
protected final static Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
protected final static Pattern SLOP_PATTERN = Pattern.compile("~(\\d+)$");
protected final static String FIELD_SEPARATOR = ":";
protected final static String OPERATOR_AND = "+";
protected IndexSearcher indexSearcher;
protected IndexReader indexReader;
protected Analyzer analyzer;
/**
 * @param indexReader the {@link IndexReader} backing the index to query (also used
 * to expand wildcard and regex terms)
 * @param analyzer the {@link Analyzer} used to analyze plain terms
 */
public AbstractQueryParser(IndexReader indexReader, Analyzer analyzer) {
this.indexReader = indexReader;
this.indexSearcher = new IndexSearcher(indexReader); // TODO: this is probably inefficient
this.analyzer = analyzer;
}
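/**
 * Run a single term through this parser's {@link Analyzer} and return the analyzed
 * form as a {@link Term}. Any field prefix is stripped first via
 * {@link #getTerm(String, TokenType)}; if the analyzer emits several tokens they are
 * concatenated into one term text. For example, with a lower-casing analyzer (an
 * assumption about the configured analyzer, not something this class guarantees)
 * the input "Love" would yield the term "love".
 * @param tokenType the token type whose name is used as the field if no field prefix is present
 * @param termString the raw term string
 * @return the analyzed {@link Term}
 * @throws IOException if analysis fails
 */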
protected Term getAnalyzedTerm(TokenType tokenType, String termString) throws IOException {
Term term = getTerm(termString, tokenType); // first ensure that we've stripped any prefixes
StringBuilder sb = new StringBuilder();
// use try-with-resources so the TokenStream is closed even if analysis fails
try (TokenStream tokenStream = analyzer.tokenStream(term.field(), new StringReader(term.text()))) {
CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
sb.append(termAtt.toString()); // concatenate all emitted tokens into a single term text
}
tokenStream.end();
}
return new Term(term.field(), sb.toString());
}
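/**
 * Build a map from each query string to its corresponding Lucene {@link Query}.
 * Each element of the array may itself contain several queries separated by
 * semi-colons (e.g. "one,two;three,four"); quotes are stripped before parsing.
 *
 * <p>A rough usage sketch (the concrete subclass name and the "lexical" token type
 * below are illustrative assumptions, not requirements of this class):</p>
 *
 * <pre>{@code
 * AbstractQueryParser parser = new SomeConcreteQueryParser(indexReader, analyzer);
 * Map<String, Query> queries = parser.getQueriesMap(
 *     new String[]{"one,two;\"three four\"~2", "wild*"}, TokenType.lexical, false);
 * }</pre>
 *
 * @param queries the raw query strings
 * @param tokenType the default token type (field) to use when no field prefix is present
 * @param collapse whether to collapse the variants of a single query (e.g. wildcard
 * expansions or comma-separated terms) into one query
 * @return a map of query string to {@link Query}
 * @throws IOException if analysis or index access fails
 */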
public Map<String, Query> getQueriesMap(String[] queries, TokenType tokenType, boolean collapse) throws IOException {
Map<String, Query> queriesMap = new HashMap<String, Query>();
// separate queries are always treated as individual (not to be collapsed)
for (String query : queries) {
// queries can also be separated by the query separator (semi-colon): one,two;three,four
for (String q : QUERY_SEPARATOR.split(query.replace(QUOTE, EMPTY).trim())) {
String qt = q.trim();
Map<String, Query> qs = getQueries(qt, tokenType, collapse);
queriesMap.putAll(qs);
}
}
return queriesMap;
}
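/**
 * Parse a single query (already stripped of quotes and semi-colons) into one or more
 * queries. Terms are separated by commas or pipes; a term containing whitespace is
 * treated as a phrase and turned into a near query, honouring an optional trailing
 * slop suffix (e.g. "three four~2"). When collapse is true and several queries result,
 * they are combined into a single query keyed by the original query string.
 */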
private Map<String, Query> getQueries(String query, TokenType tokenType, boolean collapse) throws IOException {
Map<String, Query> queriesMap = new HashMap<String, Query>();
for (String termQuery : TERM_SEPARATOR.split(query)) {
termQuery = termQuery.trim();
// determine if we have a single term or a phrase (whitespace-separated; quotes have already been stripped)
String[] parts = WHITESPACE_PATTERN.split(termQuery);
// we have a regular term (can be a wildcard, but it's not a phrase)
if (parts.length==1) {
queriesMap.putAll(getSingleTermQueries(termQuery, tokenType, collapse));
}
// we have a phrase, let's create a Near
else {
// determine if our phrase has a trailing slop: ~\d+
int slop = 0;
Matcher slopMatcher = SLOP_PATTERN.matcher(termQuery);
if (slopMatcher.find()) {
slop = Integer.parseInt(slopMatcher.group(1));
// now remove the slop pattern before continuing
parts = WHITESPACE_PATTERN.split(termQuery.substring(0, termQuery.length()-slopMatcher.group(1).length()-1));
}
List<Query> nearQueries = new ArrayList<Query>();
for (String part : parts) {
Collection<Query> qs = getSingleTermQueries(part, tokenType, true).values();
nearQueries.addAll(qs);
}
queriesMap.put(termQuery, getNearQuery(nearQueries.toArray(new Query[0]), slop, slop==0));
}
}
// if we're collapsing and have multiple queries, combine them into a single query (e.g. a boolean or span-or query, depending on the subclass)
if (collapse && queriesMap.size()>1) {
Query q = getBooleanQuery(queriesMap);
queriesMap.clear();
queriesMap.put(query, q);
}
return queriesMap;
}
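/**
 * Build queries for a single term (no whitespace). A term containing a wildcard
 * (* or ?) or a [regex] fragment becomes a wildcard or regex query; when collapse is
 * false that query is expanded into one term query per matching term in the index.
 * A plain term is analyzed first and mapped to a single term query.
 */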
private Map<String, Query> getSingleTermQueries(String termQuery, TokenType tokenType, boolean collapse) throws IOException {
Map<String, Query> queriesMap = new HashMap<String, Query>();
boolean isRegex = REGEX_PATTERN.matcher(termQuery).find();
if (termQuery.contains(WILDCARD_ASTERISK) || termQuery.contains(WILDCARD_QUESTION) || isRegex) {
Term term = getTerm(termQuery, tokenType);
Query query = isRegex ? getRegexQuery(term) : getWildCardQuery(term);
if (collapse) { // treat all wildcard variants as a single term
queriesMap.put(termQuery, query);
}
else { // separate each wildcard term into its own query
Set<Term> terms = new HashSet<Term>();
Weight weight = query.createWeight(indexSearcher, false);
weight.extractTerms(terms);
for (Term t : terms) {
// we don't need to analyze term here since it's already from the index
queriesMap.put(t.text(), getTermQuery(t));
}
}
}
else { // regular term (we hope)
Term term = getAnalyzedTerm(tokenType, termQuery); // analyze it first
queriesMap.put(termQuery, getTermQuery(term));
}
return queriesMap;
}
/**
 * Create a {@link Term} from the raw term string: any leading operator is stripped
 * and an optional field prefix (everything before the first ':') overrides the
 * default field derived from the token type.
 * @param term the original term string
 * @param tokenType the token type whose name is used as the field if no field prefix is present
 * @return a new {@link Term}
 */
protected Term getTerm(String term, TokenType tokenType) {
// strip operators
if (term.startsWith(OPERATOR_AND)) {term=term.substring(1);}
String field = tokenType.name(); // default
if (term.contains(FIELD_SEPARATOR)) {
int pos = term.indexOf(FIELD_SEPARATOR);
field = term.substring(0, pos);
term = term.substring(pos+1);
}
return new Term(field, term);
}
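/**
 * Combine multiple queries into a single query matching any of them; the concrete
 * type (e.g. a boolean or span-based disjunction) is up to the subclass.
 */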
protected abstract Query getBooleanQuery(Map<String, Query> queriesMap) throws IOException;
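/**
 * Create a phrase-like (near) query from the given queries, with the given slop and
 * ordering constraint.
 */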
protected abstract Query getNearQuery(Query[] queries, int slop, boolean inOrder);
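/**
 * Create a query for a term containing wildcard characters (* or ?).
 */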
protected abstract Query getWildCardQuery(Term term) throws IOException;
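/**
 * Create a regular expression query for a term containing a [regex] fragment.
 */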
protected abstract Query getRegexQuery(Term term) throws IOException;
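/**
 * Create a query for a single, already analyzed (or index-derived) term.
 */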
protected abstract Query getTermQuery(Term term);
}