package org.wikibrain.lucene;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.queries.ChainedFilter;
import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.flexible.standard.QueryParserUtil;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.Query;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.lang.Language;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
*
* This class provides various utilities for building different types of queries.
*
* @author Yulun Li
* @author Ari Weiland
*
*/
public class QueryBuilder {
    private static final Logger LOG = LoggerFactory.getLogger(QueryBuilder.class);

    /** Default value handed to {@code MoreLikeThis.setMaxDocFreqPct}. */
    public static final int DEFAULT_MAX_PERCENTAGE = 5;
    /** Default value handed to {@code MoreLikeThis.setMaxQueryTerms}. */
    public static final int DEFAULT_MAX_QUERY_TERMS = 20;
    /** Default value handed to {@code MoreLikeThis.setMinTermFreq}. */
    public static final int DEFAULT_MIN_TERM_FREQ = 2;
    /** Default value handed to {@code MoreLikeThis.setMinDocFreq}. */
    public static final int DEFAULT_MIN_DOC_FREQ = 5;
    /** Default number of hits requested from the searcher. */
    public static final int DEFAULT_HIT_COUNT = 1000;

    private final Language language;
    private final LuceneSearcher searcher;
    private final List<Filter> filters = new ArrayList<Filter>();

    // Stays null until one of the set*Query methods succeeds.
    private Query query = null;
    private int numHits = DEFAULT_HIT_COUNT;

    // For more like this queries
    private int maxPercentage = DEFAULT_MAX_PERCENTAGE;
    private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;
    private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
    private int minDocFreq = DEFAULT_MIN_DOC_FREQ;

    // If true, look up Wikipedia ids for Lucene ids.
    private boolean resolveWikipediaIds = true;

    /**
     * Creates a builder bound to a searcher and a language.
     *
     * @param searcher the searcher that supplies options, analyzers and readers,
     *                 and that eventually executes the query
     * @param language the language used to select the analyzer and reader
     */
    public QueryBuilder(LuceneSearcher searcher, Language language) {
        this.searcher = searcher;
        this.language = language;
    }

    /**
     * Sets the query by parsing the search string against the default text
     * field taken from the searcher's LuceneOptions.
     *
     * @param searchString the raw text to search for; it is escaped before parsing
     * @return this builder, for chaining
     */
    public QueryBuilder setPhraseQuery(String searchString) {
        return setPhraseQuery(searcher.getOptions().elements, searchString);
    }

    /**
     * Sets the query by parsing the search string against the text field
     * specified by elements.
     *
     * @param elements specifies the text field in which to search
     * @param searchString the raw text to search for; it is escaped before parsing
     * @return this builder, for chaining
     */
    public QueryBuilder setPhraseQuery(TextFieldElements elements, String searchString) {
        return setPhraseQuery(elements.getTextFieldName(), searchString);
    }

    /**
     * Sets the query by escaping the search string and parsing it with a
     * {@link QueryParser} over the specified field, using the analyzer the
     * searcher provides for this builder's language.
     *
     * @param fieldName the name of the field on which to search
     * @param searchString the raw text to search for; it is escaped before parsing
     * @return this builder, for chaining
     * @throws RuntimeException wrapping a {@link ParseException} if the escaped
     *         string still fails to parse
     */
    public QueryBuilder setPhraseQuery(String fieldName, String searchString) {
        QueryParser parser = new QueryParser(
                searcher.getOptions().matchVersion,
                fieldName,
                searcher.getAnalyzerByLanguage(language));
        String escaped = QueryParserUtil.escape(searchString);
        // Lucene doesn't escape forward slash, but it needs to
        escaped = StringUtils.replace(escaped, "/", "\\/");
        try {
            query = parser.parse(escaped);
        } catch (ParseException e) {
            throw new RuntimeException(e); // should never happen after escaping
        }
        return this;
    }

    /**
     * Builds a MoreLikeThis query for the specified luceneId over the
     * default text field in LuceneOptions.
     *
     * @param luceneId the Lucene document id to find similar documents for; must be non-negative
     * @return this builder, for chaining
     * @throws DaoException declared by the delegate overload
     * @throws IllegalArgumentException if luceneId is negative
     */
    public QueryBuilder setMoreLikeThisQuery(int luceneId) throws DaoException {
        return setMoreLikeThisQuery(
                searcher.getOptions().elements,
                luceneId);
    }

    /**
     * Builds a MoreLikeThis query for the specified luceneId over the
     * text field specified by the TextFieldElements.
     *
     * @param elements specifies the text field to compare on
     * @param luceneId the Lucene document id to find similar documents for; must be non-negative
     * @return this builder, for chaining
     * @throws DaoException declared by the delegate overload
     * @throws IllegalArgumentException if luceneId is negative
     */
    public QueryBuilder setMoreLikeThisQuery(TextFieldElements elements, int luceneId) throws DaoException {
        return setMoreLikeThisQuery(elements.getTextFieldName(), luceneId);
    }

    /**
     * Builds a MoreLikeThis query for the specified luceneId over the
     * specified text field, configured with this builder's current
     * maxPercentage, maxQueryTerms, minDocFreq and minTermFreq settings.
     *
     * If building the query fails with an IOException the failure is logged
     * and the builder's query is left as it was before the call (possibly
     * unset, possibly a query from an earlier set* call).
     *
     * @param fieldName the name of the text field to compare on
     * @param luceneId the Lucene document id to find similar documents for; must be non-negative
     * @return this builder, for chaining
     * @throws DaoException part of the declared signature; not thrown directly by
     *         the code in this method (NOTE(review): presumably raised by the
     *         searcher accessors - confirm)
     * @throws IllegalArgumentException if luceneId is negative
     */
    public QueryBuilder setMoreLikeThisQuery(String fieldName, int luceneId) throws DaoException {
        if (luceneId < 0) {
            throw new IllegalArgumentException("Illegal Lucene ID: " + luceneId);
        }
        try {
            MoreLikeThis mlt = new MoreLikeThis(searcher.getReaderByLanguage(language));
            mlt.setMaxDocFreqPct(maxPercentage);
            mlt.setMaxQueryTerms(maxQueryTerms);
            mlt.setMinDocFreq(minDocFreq);
            mlt.setMinTermFreq(minTermFreq);
            mlt.setAnalyzer(searcher.getAnalyzerByLanguage(language));
            mlt.setFieldNames(new String[]{ fieldName });
            query = mlt.like(luceneId);
        } catch (IOException e) {
            // Best effort: keep going, but pass the exception to the logger so
            // the cause and stack trace are not lost.
            LOG.warn("Can't more like this query for luceneId: " + luceneId, e);
        }
        return this;
    }

    /**
     * @return true if one of the set*Query methods has produced a query
     */
    public boolean hasQuery() {
        return query != null;
    }

    /**
     * Sets the flag that is forwarded to the searcher when {@link #search()} runs.
     *
     * @param resolve if true, look up Wikipedia ids for Lucene ids
     */
    public void setResolveWikipediaIds(boolean resolve) {
        this.resolveWikipediaIds = resolve;
    }

    /**
     * Runs the current query through the searcher, passing along the language,
     * the hit limit, the combined filters and the resolveWikipediaIds flag.
     *
     * @return the scored documents returned by the searcher
     * @throws IllegalArgumentException if no query has been set
     */
    public WikiBrainScoreDoc[] search() {
        if (!hasQuery()) {
            throw new IllegalArgumentException("no query specified. call one of the QueryBuilder.set* methods to specify a query");
        }
        return searcher.search(query, language, numHits, getFilters(), resolveWikipediaIds);
    }

    /**
     * Adds a filter to the chain of filters. DOES NOT remove existing filters.
     *
     * @param filter the filter to append
     */
    public void addFilter(Filter filter) {
        this.filters.add(filter);
    }

    /**
     * Collapses the added filters into the single filter handed to the searcher.
     *
     * @return null if no filter was added, the filter itself if exactly one was
     *         added, otherwise a {@link ChainedFilter} wrapping all of them
     */
    public Filter getFilters() {
        if (filters.isEmpty()) {
            return null;
        }
        if (filters.size() == 1) {
            return filters.get(0);
        }
        // The no-arg toArray() returns Object[], which cannot be cast to
        // Filter[]; the typed overload yields an array of the right runtime type.
        return new ChainedFilter(filters.toArray(new Filter[filters.size()]));
    }

    /**
     * @return the maximum number of hits requested from the searcher
     */
    public int getNumHits() {
        return numHits;
    }

    /**
     * @param hits the maximum number of hits to request from the searcher
     * @return this builder, for chaining
     */
    public QueryBuilder setNumHits(int hits) {
        this.numHits = hits;
        return this;
    }

    /**
     * @return the value passed to {@code MoreLikeThis.setMaxDocFreqPct}
     */
    public int getMaxPercentage() {
        return maxPercentage;
    }

    /**
     * Takes effect for MoreLikeThis queries built after this call.
     *
     * @param maxPercentage the value to pass to {@code MoreLikeThis.setMaxDocFreqPct}
     */
    public void setMaxPercentage(int maxPercentage) {
        this.maxPercentage = maxPercentage;
    }

    /**
     * @return the value passed to {@code MoreLikeThis.setMaxQueryTerms}
     */
    public int getMaxQueryTerms() {
        return maxQueryTerms;
    }

    /**
     * Takes effect for MoreLikeThis queries built after this call.
     *
     * @param maxQueryTerms the value to pass to {@code MoreLikeThis.setMaxQueryTerms}
     */
    public void setMaxQueryTerms(int maxQueryTerms) {
        this.maxQueryTerms = maxQueryTerms;
    }

    /**
     * @return the value passed to {@code MoreLikeThis.setMinTermFreq}
     */
    public int getMinTermFreq() {
        return minTermFreq;
    }

    /**
     * Takes effect for MoreLikeThis queries built after this call.
     *
     * @param minTermFreq the value to pass to {@code MoreLikeThis.setMinTermFreq}
     */
    public void setMinTermFreq(int minTermFreq) {
        this.minTermFreq = minTermFreq;
    }

    /**
     * @return the value passed to {@code MoreLikeThis.setMinDocFreq}
     */
    public int getMinDocFreq() {
        return minDocFreq;
    }

    /**
     * Takes effect for MoreLikeThis queries built after this call.
     *
     * @param minDocFreq the value to pass to {@code MoreLikeThis.setMinDocFreq}
     */
    public void setMinDocFreq(int minDocFreq) {
        this.minDocFreq = minDocFreq;
    }
}