/*
* Copyright (c) 2011 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.flaptor.indextank.suggest;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.flaptor.indextank.query.IndexEngineParser;
import com.flaptor.indextank.query.Query;
import com.flaptor.indextank.query.QueryNode;
import com.flaptor.indextank.query.SimplePhraseQuery;
import com.flaptor.indextank.query.TermQuery;
import com.flaptor.org.apache.lucene.util.automaton.LevenshteinAutomata;
import com.flaptor.util.Pair;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
/**
* A suggestor that uses the corpus (index) and a VP-tree to suggest queries
* based on Levenshtein distance on each term.
*/
public class DidYouMeanSuggestor {
private final IndexEngineParser parser;
private final NewPopularityIndex npi;
private final NewPopularityIndex.PopularityIndexAutomaton dictAutomaton;
public DidYouMeanSuggestor(TermSuggestor suggestor) {
Preconditions.checkNotNull(suggestor);
this.parser = new IndexEngineParser("text");
this.npi = suggestor.getPopularityIndex();
this.dictAutomaton = NewPopularityIndex.PopularityIndexAutomaton.adapt(this.npi);
}
public List<Pair<Query, String>> suggest(Query query) {
Query newQuery = query.duplicate();
String newOriginal = traverseNode(newQuery.getRoot(), newQuery.getOriginalStr());
if (newQuery.equals(query)) {
return Lists.newArrayList();
}
return Lists.newArrayList(new Pair<Query, String>(newQuery, newOriginal));
}
private String traverseNode(QueryNode node, String queryString) {
if (node instanceof TermQuery) {
TermQuery termQuery = (TermQuery)node;
String term = termQuery.getTerm();
String suggestedTerm = suggestWord(term);
if (suggestedTerm != null) {
queryString = replaceSuggestion(queryString, term, suggestedTerm);
if (queryString == null) {
return null;
}
termQuery.setTerm(suggestedTerm);
}
} else if (node instanceof SimplePhraseQuery) {
SimplePhraseQuery phraseQuery = (SimplePhraseQuery) node;
String[] termsArray = phraseQuery.getTermsArray();
for (int i = 0; i < termsArray.length; i++) {
String term = termsArray[i];
String suggestedTerm = suggestWord(term);
if (suggestedTerm != null) {
queryString = replaceSuggestion(queryString, term, suggestedTerm);
if (queryString == null) {
return null;
}
termsArray[i] = suggestedTerm;
}
}
}
Iterable<QueryNode> children = node.getChildren();
for (QueryNode queryNode : children) {
queryString = traverseNode(queryNode, queryString);
if (queryString == null) {
return null;
}
}
return queryString;
}
private String replaceSuggestion(String queryString, String term, String suggestedTerm) {
Pattern pattern = Pattern.compile("\\b(" + term + ")\\b(?!\\:)", Pattern.CASE_INSENSITIVE);
Matcher matcher = pattern.matcher(queryString);
StringBuffer sb = new StringBuffer();
if (matcher.find()) {
matcher.appendReplacement(sb, suggestedTerm);
matcher.appendTail(sb);
if (matcher.find()) {
return null;
}
} else {
return queryString;
}
return sb.toString();
}
private String suggestWord(String term) {
String bestSuggestion = null;
if (term.length() > 3) {
com.flaptor.org.apache.lucene.util.automaton.Automaton lev = new LevenshteinAutomata(term).toAutomaton(1);
LuceneAutomaton levAutomaton = LuceneAutomaton.adapt(lev);
int max = 0;
for (String suggestion: com.flaptor.indextank.suggest.Automaton.intersectPaths(dictAutomaton, levAutomaton)){
if (term.equals(suggestion)){
// don't suggest anything for words seen on the corpus
bestSuggestion = null;
break;
}
int count = this.npi.getCount("text:" + suggestion);
if (count > max) {
bestSuggestion = suggestion;
max = count;
}
}
}
return bestSuggestion;
}
}