package no.priv.garshol.duke.databases;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import no.priv.garshol.duke.Comparator;
import no.priv.garshol.duke.Configuration;
import no.priv.garshol.duke.Database;
import no.priv.garshol.duke.DukeConfigException;
import no.priv.garshol.duke.DukeException;
import no.priv.garshol.duke.Property;
import no.priv.garshol.duke.Record;
import no.priv.garshol.duke.comparators.GeopositionComparator;
import no.priv.garshol.duke.utils.Utils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexNotFoundException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
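/*
* A minimal usage sketch (illustrative only, not taken verbatim from the
* project; it assumes a Configuration loaded elsewhere and a collection
* of records named "records"):
*
*   Configuration config = ...; // e.g. loaded with Duke's ConfigLoader
*   LuceneDatabase db = new LuceneDatabase();
*   db.setConfiguration(config);
*   db.setPath("/tmp/duke-index"); // omit to keep the index in memory
*   for (Record r : records)
*     db.index(r);
*   db.commit();
*   Collection<Record> candidates = db.findCandidateMatches(records.iterator().next());
*   db.close();
*/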
/**
* Represents the Lucene index, and implements record linkage services
* on top of it.
*/
public class LuceneDatabase implements Database {
private Configuration config;
private EstimateResultTracker maintracker;
private IndexWriter iwriter;
private Directory directory;
private IndexReader reader;
private IndexSearcher searcher;
private Analyzer analyzer;
// timings from the Deichman case:
//   expansion factor 1 = 40 minutes
//   expansion factor 4 = 48 minutes
private final static int SEARCH_EXPANSION_FACTOR = 1;
private int max_search_hits;
private float min_relevance;
private boolean overwrite;
private String path;
private boolean fuzzy_search;
public BoostMode boost_mode;
// helper for indexing and searching geographic coordinates
private GeoProperty geoprop;
public LuceneDatabase() {
this.analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
this.maintracker = new EstimateResultTracker();
this.max_search_hits = 1000000;
this.fuzzy_search = true; // on by default
this.boost_mode = BoostMode.QUERY;
}
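/**
* Sets the configuration used to interpret records and their properties.
*/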
public void setConfiguration(Configuration config) {
this.config = config;
}
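/**
* If true, any existing index at the configured path is discarded and
* rebuilt; if false, newly indexed records replace previous copies of
* themselves.
*/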
public void setOverwrite(boolean overwrite) {
this.overwrite = overwrite;
}
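/**
* Sets the maximum number of hits a single candidate search may
* retrieve from Lucene.
*/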
public void setMaxSearchHits(int max_search_hits) {
this.max_search_hits = max_search_hits;
}
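/**
* Sets the minimum Lucene relevance score a hit must have in order to
* be returned as a candidate match.
*/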
public void setMinRelevance(float min_relevance) {
this.min_relevance = min_relevance;
}
/**
* Controls whether to use fuzzy searches for properties that have
* fuzzy comparators. True by default.
*/
public void setFuzzySearch(boolean fuzzy_search) {
this.fuzzy_search = fuzzy_search;
}
/**
* Returns the path to the Lucene index directory. If null, it means
* the Lucene index is kept in-memory.
*/
public String getPath() {
return path;
}
/**
* Sets the path to the Lucene index directory. If null or not set,
* the Lucene index is kept in-memory.
*/
public void setPath(String path) {
this.path = path;
}
/**
* Tells the database whether to boost Lucene fields, either at index
* time or when searching for candidate matches, depending on the
* properties' probabilities. This can help Lucene rank the most
* interesting candidates higher.
*/
public void setBoostMode(BoostMode boost_mode) {
this.boost_mode = boost_mode;
}
/**
* Returns true iff the Lucene index is held in memory rather than
* on disk.
*/
public boolean isInMemory() {
return (directory instanceof RAMDirectory);
}
/**
* Add the record to the index.
*/
public void index(Record record) {
if (directory == null)
init();
if (!overwrite && path != null)
delete(record);
Document doc = new Document();
for (String propname : record.getProperties()) {
Property prop = config.getPropertyByName(propname);
if (prop == null)
throw new DukeConfigException("Record has property " + propname +
" for which there is no configuration");
if (prop.getComparator() instanceof GeopositionComparator &&
geoprop != null) {
// index specially as geocoordinates
String v = record.getValue(propname);
if (v == null || v.equals(""))
continue;
// this gives us a searchable geoindexed value
for (IndexableField f : geoprop.createIndexableFields(v))
doc.add(f);
// this preserves the coordinates in readable form for display purposes
doc.add(new Field(propname, v, Field.Store.YES,
Field.Index.NOT_ANALYZED));
} else {
Field.Index ix;
if (prop.isIdProperty())
ix = Field.Index.NOT_ANALYZED; // so findRecordById will work
else // if (prop.isAnalyzedProperty())
ix = Field.Index.ANALYZED;
// FIXME: it turns out that with the StandardAnalyzer you can't have a
// multi-token value that's not analyzed if you want to find it again...
// else
// ix = Field.Index.NOT_ANALYZED;
Float boost = getBoostFactor(prop.getHighProbability(), BoostMode.INDEX);
for (String v : record.getValues(propname)) {
if (v.equals(""))
continue; // FIXME: not sure if this is necessary
Field field = new Field(propname, v, Field.Store.YES, ix);
if (boost != null)
field.setBoost(boost);
doc.add(field);
}
}
}
try {
iwriter.addDocument(doc);
} catch (IOException e) {
throw new DukeException(e);
}
}
private void delete(Record record) {
// removes previous copy of this record from the index, if it's there
Property idprop = config.getIdentityProperties().iterator().next();
Query q = parseTokens(idprop.getName(), record.getValue(idprop.getName()));
try {
iwriter.deleteDocuments(q);
} catch (IOException e) {
throw new DukeException(e);
}
}
/**
* Flushes all changes to disk.
*/
public void commit() {
if (directory == null)
return;
try {
if (reader != null)
reader.close();
// it turns out that IndexWriter.optimize actually slows
// searches down, because it invalidates the cache. therefore
// not calling it any more.
// http://www.searchworkings.org/blog/-/blogs/uwe-says%3A-is-your-reader-atomic
// iwriter.optimize();
iwriter.commit();
openSearchers();
} catch (IOException e) {
throw new DukeException(e);
}
}
/**
* Look up record by identity.
*/
public Record findRecordById(String id) {
if (directory == null)
init();
Property idprop = config.getIdentityProperties().iterator().next();
for (Record r : lookup(idprop, id))
if (r.getValue(idprop.getName()).equals(id))
return r;
return null; // not found
}
/**
* Look up potentially matching records.
*/
public Collection<Record> findCandidateMatches(Record record) {
if (directory == null)
init();
// if we have a geoprop it means that's the only way to search
if (geoprop != null) {
String value = record.getValue(geoprop.getName());
if (value != null) {
Filter filter = geoprop.geoSearch(value);
return maintracker.doQuery(new MatchAllDocsQuery(), filter);
}
}
// ok, we didn't do a geosearch, so proceed as normal.
// first we build the combined query for all lookup properties
BooleanQuery query = new BooleanQuery();
for (Property prop : config.getLookupProperties()) {
Collection<String> values = record.getValues(prop.getName());
if (values == null)
continue;
for (String value : values)
parseTokens(query, prop.getName(), value,
prop.getLookupBehaviour() == Property.Lookup.REQUIRED,
prop.getHighProbability());
}
// do the query
return maintracker.doQuery(query);
}
/**
* Stores state to disk and closes all open resources.
*/
public void close() {
if (directory == null)
return;
try {
iwriter.close();
directory.close();
if (reader != null)
reader.close();
} catch (IOException e) {
throw new DukeException(e);
}
}
public String toString() {
return "LuceneDatabase, max-search-hits: " + max_search_hits +
", min-relevance: " + min_relevance + ", fuzzy: " + fuzzy_search +
", boost-mode: " + boost_mode + ", path: " + path + "\n " + directory;
}
// ----- INTERNALS
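// lazily sets up the directory, index writer, searchers, and spatial support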
private void init() {
try {
openIndexes(overwrite);
openSearchers();
initSpatial();
} catch (Exception e) {
// initialization failed, so clean up to prevent leaving us in an
// inconsistent state https://github.com/larsga/Duke/issues/226
directory = null;
throw new DukeException(e);
}
}
private void openIndexes(boolean overwrite) throws IOException {
if (directory == null) {
try {
if (path == null)
directory = new RAMDirectory();
else {
//directory = new MMapDirectory(new File(config.getPath()));
// as per http://wiki.apache.org/lucene-java/ImproveSearchingSpeed
// we use NIOFSDirectory, provided we're not on Windows
if (Utils.isWindowsOS())
directory = FSDirectory.open(new File(path));
else
directory = NIOFSDirectory.open(new File(path));
}
IndexWriterConfig cfg =
new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer);
cfg.setOpenMode(overwrite ? IndexWriterConfig.OpenMode.CREATE :
IndexWriterConfig.OpenMode.APPEND);
iwriter = new IndexWriter(directory, cfg);
iwriter.commit(); // so that the searcher doesn't fail
} catch (IndexNotFoundException e) {
if (!overwrite) {
// the index was not there, so make a new one
directory = null; // ensure we really do try again
openIndexes(true);
} else
throw new DukeException(e);
}
}
}
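/**
* (Re)opens the index reader and searcher against the current directory.
*/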
public void openSearchers() throws IOException {
reader = DirectoryReader.open(directory);
searcher = new IndexSearcher(reader);
}
/**
* Parses the value into a query. We tokenize the value ourselves
* instead of using a QueryParser in order to avoid thread-safety
* issues with Lucene's query parser.
*
* @param fieldName the name of the field
* @param value the value of the field
* @return the parsed query
*/
private Query parseTokens(String fieldName, String value) {
BooleanQuery searchQuery = new BooleanQuery();
if (value != null) {
Analyzer analyzer = new KeywordAnalyzer();
try {
TokenStream tokenStream =
analyzer.tokenStream(fieldName, new StringReader(value));
tokenStream.reset();
CharTermAttribute attr =
tokenStream.getAttribute(CharTermAttribute.class);
while (tokenStream.incrementToken()) {
String term = attr.toString();
Query termQuery = new TermQuery(new Term(fieldName, term));
searchQuery.add(termQuery, Occur.SHOULD);
}
} catch (IOException e) {
throw new DukeException("Error parsing input string '" + value + "' " +
"in field " + fieldName, e);
}
}
return searchQuery;
}
/**
* Tokenizes the value and adds one term (or fuzzy) query per token to
* the parent query.
* @param required Iff true, candidate records must match this value.
*/
private void parseTokens(BooleanQuery parent, String fieldName,
String value, boolean required, double probability) {
value = escapeLucene(value);
if (value.length() == 0)
return;
try {
TokenStream tokenStream =
analyzer.tokenStream(fieldName, new StringReader(value));
tokenStream.reset();
CharTermAttribute attr =
tokenStream.getAttribute(CharTermAttribute.class);
Float boost = getBoostFactor(probability, BoostMode.QUERY);
while (tokenStream.incrementToken()) {
String term = attr.toString();
Query termQuery;
if (fuzzy_search && isFuzzy(fieldName))
termQuery = new FuzzyQuery(new Term(fieldName, term));
else
termQuery = new TermQuery(new Term(fieldName, term));
if (boost != null)
termQuery.setBoost(boost);
parent.add(termQuery, required ? Occur.MUST : Occur.SHOULD);
}
} catch (IOException e) {
throw new DukeException("Error parsing input string '" + value + "' " +
"in field " + fieldName, e);
}
}
private boolean isFuzzy(String fieldName) {
Comparator c = config.getPropertyByName(fieldName).getComparator();
return c != null && c.isTokenized();
}
private String escapeLucene(String query) {
char[] tmp = new char[query.length() * 2];
int count = 0;
for (int ix = 0; ix < query.length(); ix++) {
char ch = query.charAt(ix);
if (ch == '*' || ch == '?' || ch == '!' || ch == '&' || ch == '(' ||
ch == ')' || ch == '-' || ch == '+' || ch == ':' || ch == '"' ||
ch == '[' || ch == ']' || ch == '~' || ch == '{' || ch == '}' ||
ch == '^' || ch == '|')
tmp[count++] = '\\'; // these characters must be escaped
tmp[count++] = ch;
}
return new String(tmp, 0, count).trim();
}
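/**
* Looks up candidate records matching the given value for the given
* property.
*/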
public Collection<Record> lookup(Property property, String value) {
Query query = parseTokens(property.getName(), value);
return maintracker.doQuery(query);
}
/**
* The tracker is used to estimate the size of the query result
* we should ask Lucene for. This parameter is the single biggest
* influence on search performance, but setting it too low causes
* matches to be missed. We therefore try hard to estimate it as
* correctly as possible.
*
* The tracker uses a ring buffer of recent result sizes to
* estimate the result size.
*/
class EstimateResultTracker {
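// the current estimate of how many hits to ask Lucene for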
private int limit;
/**
* Ring buffer containing n last search result sizes, except for
* searches which found nothing.
*/
private int[] prevsizes;
private int sizeix; // position in prevsizes
public EstimateResultTracker() {
this.limit = 100;
this.prevsizes = new int[10];
}
public Collection<Record> doQuery(Query query) {
return doQuery(query, null);
}
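/**
* Runs the query, retrying with a larger result window whenever the
* window fills up, until the window is no longer filled or it reaches
* max_search_hits. Hits scoring below min_relevance are discarded, and
* result sizes are recorded so the initial window can adapt over time.
*/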
public Collection<Record> doQuery(Query query, Filter filter) {
List<Record> matches;
try {
ScoreDoc[] hits;
int thislimit = Math.min(limit, max_search_hits);
while (true) {
hits = searcher.search(query, filter, thislimit).scoreDocs;
if (hits.length < thislimit || thislimit == max_search_hits)
break;
thislimit = thislimit * 5;
}
matches = new ArrayList<Record>(Math.min(hits.length, max_search_hits));
for (int ix = 0; ix < hits.length &&
hits[ix].score >= min_relevance; ix++)
matches.add(new DocumentRecord(hits[ix].doc,
searcher.doc(hits[ix].doc)));
if (hits.length > 0) {
synchronized(this) {
prevsizes[sizeix++] = matches.size();
if (sizeix == prevsizes.length) {
sizeix = 0;
limit = Math.max((int) (average() * SEARCH_EXPANSION_FACTOR), limit);
}
}
}
} catch (IOException e) {
throw new DukeException(e);
}
return matches;
}
private double average() {
int sum = 0;
int ix = 0;
for (; ix < prevsizes.length && prevsizes[ix] != 0; ix++)
sum += prevsizes[ix];
return sum / (double) ix;
}
}
/**
* Checks to see if we need the spatial support, and if so creates
* the necessary context objects.
*/
private void initSpatial() {
// FIXME: for now, we only use geosearch if that's the only way to
// find suitable records, since we don't know how to combine
// geosearch ranking with normal search ranking.
if (config.getLookupProperties().size() != 1)
return;
Property prop = config.getLookupProperties().iterator().next();
if (!(prop.getComparator() instanceof GeopositionComparator))
return;
geoprop = new GeoProperty(prop);
}
public enum BoostMode {
/**
* Boost fields at query time.
*/
QUERY,
/**
* Boost fields at index time. This means records must be
* reindexed to change the boosting.
*/
INDEX,
/**
* Don't boost fields.
*/
NONE;
}
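// Translates a property's high probability into a boost factor for the
// given phase, or null if boosting is disabled for that phase. The boost
// grows as the probability approaches 1: for example, p = 0.5 gives
// sqrt(1 / (0.5 * 2)) = 1.0, while p = 0.9 gives sqrt(1 / (0.1 * 2)) = ~2.24.
// The probability is capped at 0.99 so the divisor never reaches zero.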
private Float getBoostFactor(double probability, BoostMode phase) {
Float boost = null;
if (phase == boost_mode) {
double p = Math.min(0.99, probability); // don't divide by zero
boost = (float) Math.sqrt(1.0 / ((1.0 - p) * 2.0));
}
return boost;
}
}