package no.priv.garshol.duke.databases;
import java.util.Map;
import java.util.HashMap;
import java.util.List;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import no.priv.garshol.duke.Record;
import no.priv.garshol.duke.Property;
import no.priv.garshol.duke.Database;
import no.priv.garshol.duke.Configuration;
import no.priv.garshol.duke.utils.StringUtils;
/**
* A database that uses a key-value store to index and find records.
* Faster than Lucene, but relevance ranking is not as good, and has
* no fuzzy or geospatial support.
* @since 1.0
*/
public class KeyValueDatabase implements Database {
// the active configuration; supplies the identity and lookup properties
private Configuration config;
// the key-value store holding records, external ids, and token buckets
private KeyValueStore store;
// upper bound on the number of candidates findCandidateMatches returns
private int max_search_hits;
// candidates scoring below this threshold are dropped; 0.0 disables filtering
private float min_relevance;
// when true, findCandidateMatches traces its progress to stdout
private static final boolean DEBUG = false;
// we'll never gather more candidates than CF1 * max_search_hits
private static final int CUTOFF_FACTOR_1 = 20;
// buckets that have more elements than candidates.size * CF2 are ignored
private static final int CUTOFF_FACTOR_2 = 50;
/**
 * Creates a database backed by an in-memory key-value store, with a
 * very high default limit on search hits.
 */
public KeyValueDatabase() {
this.store = new InMemoryKeyValueStore();
this.max_search_hits = 1000000;
}
/**
 * Sets the configuration from which identity and lookup properties
 * are taken.
 */
public void setConfiguration(Configuration config) {
this.config = config;
}
/**
 * No-op in this implementation; required by the Database interface.
 */
public void setOverwrite(boolean overwrite) {
}
/**
 * Sets the maximum number of candidate records returned from
 * findCandidateMatches.
 */
public void setMaxSearchHits(int max_search_hits) {
this.max_search_hits = max_search_hits;
}
/**
 * Sets the minimum score a candidate must have to be returned from
 * findCandidateMatches.
 */
public void setMinRelevance(float min_relevance) {
this.min_relevance = min_relevance;
}
/**
 * Returns true iff the database is held entirely in memory, and
 * thus is not persistent. Delegates to the underlying store.
 */
public boolean isInMemory() {
return store.isInMemory();
}
/**
 * Adds the record to the index: allocates an internal ID, registers
 * the record's external identifiers, and registers every token of
 * every lookup-property value.
 */
public void index(Record record) {
  // FIXME: check if record is already indexed
  // assign an internal ID and store the record itself
  long recid = store.makeNewRecordId();
  store.registerRecord(recid, record);

  // register every external identifier of the record
  for (Property idprop : config.getIdentityProperties()) {
    for (String extid : record.getValues(idprop.getName()))
      store.registerId(recid, extid);
  }

  // tokenize each lookup value and register every token
  for (Property lookupprop : config.getLookupProperties()) {
    String prop = lookupprop.getName();
    for (String value : record.getValues(prop)) {
      for (String token : StringUtils.split(value))
        store.registerToken(recid, prop, token);
    }
  }
}
/**
 * Look up record by identity. Delegates directly to the underlying
 * key-value store.
 * @param id an external identifier of the record
 */
public Record findRecordById(String id) {
return store.findRecordById(id);
}
/**
 * Look up potentially matching records. Tokenizes the record's
 * lookup values, fetches the index bucket for each token, collects
 * candidate records from the most selective buckets, and filters
 * the result by min_relevance and max_search_hits.
 */
public Collection<Record> findCandidateMatches(Record record) {
  if (DEBUG)
    System.out.println("---------------------------------------------------------------------------");

  // do lookup on all tokens from all lookup properties
  // (we only identify the buckets for now. later we decide how to process
  // them)
  List<Bucket> buckets = lookup(record);

  // preprocess the list of buckets
  Collections.sort(buckets);
  double score_sum = 0.0;
  for (Bucket b : buckets)
    score_sum += b.getScore();

  // walk backwards from the lowest-scoring bucket until the skipped
  // buckets account for min_relevance of the total score; buckets at
  // or below the threshold index are not used to find new candidates.
  // FIX: the "threshold >= 0" guard prevents buckets.get(-1) and an
  // IndexOutOfBoundsException when min_relevance is at (or, through
  // floating-point rounding, slightly above) the full score ratio.
  double score_so_far = 0.0;
  int threshold = buckets.size() - 1;
  for (; threshold >= 0 && (score_so_far / score_sum) < min_relevance; threshold--) {
    score_so_far += buckets.get(threshold).getScore();
    if (DEBUG)
      System.out.println("score_so_far: " + (score_so_far/score_sum) + " (" +
                         threshold + ")");
  }
  // bucket.get(threshold) made us go over the limit, so we need to step
  // one back
  threshold++;
  if (DEBUG)
    System.out.println("Threshold: " + threshold);

  // the collection of candidates, keyed by internal record id
  Map<Long, Score> candidates = new HashMap<Long, Score>();

  // go through the buckets that we're going to collect candidates from
  int next_bucket = collectCandidates(candidates, buckets, threshold);

  // there might still be some buckets left below the threshold. for
  // these we go through the existing candidates and check if we can
  // find them in the buckets.
  bumpScores(candidates, buckets, next_bucket);
  if (DEBUG)
    System.out.println("candidates: " + candidates.size());

  // if the cutoff properties are not set we can stop right here
  // FIXME: it's possible to make this a lot cleaner
  if (max_search_hits > candidates.size() && min_relevance == 0.0) {
    Collection<Record> cands = new ArrayList<Record>(candidates.size());
    for (Long id : candidates.keySet())
      cands.add(store.findRecordById(id));
    if (DEBUG)
      System.out.println("final: " + cands.size());
    return cands;
  }

  // flatten candidates into an array, prior to heaping
  // (the unused max_score computation of the old code is gone)
  int ix = 0;
  Score[] scores = new Score[candidates.size()];
  for (Score s : candidates.values())
    scores[ix++] = s;

  // allow map to be GC-ed
  candidates = null;

  // filter candidates with min_relevance and max_search_hits. do
  // this by turning the scores[] array into a priority queue (on
  // .score), then retrieving the best candidates. (gives a big
  // performance improvement over sorting the array.)
  PriorityQueue pq = new PriorityQueue(scores);
  int count = Math.min(scores.length, max_search_hits);
  Collection<Record> records = new ArrayList<Record>(count);
  for (ix = 0; ix < count; ix++) {
    Score s = pq.next();
    if (s.score >= min_relevance)
      records.add(store.findRecordById(s.id));
  }
  if (DEBUG)
    System.out.println("final: " + records.size());
  return records;
}
/**
 * Flushes all changes to disk. For in-memory databases this is a
 * no-op. Delegates to the underlying key-value store.
 */
public void commit() {
store.commit();
}
/**
 * Stores state to disk and closes all open resources. Delegates to
 * the underlying key-value store.
 */
public void close() {
store.close();
}
/**
 * Returns a debug-friendly description of this database and its
 * cutoff settings.
 */
@Override
public String toString() {
  StringBuilder buf = new StringBuilder();
  buf.append("KeyValueDatabase(");
  buf.append(store);
  buf.append("), max_search_hits=");
  buf.append(max_search_hits);
  buf.append(", min_relevance=");
  buf.append(min_relevance);
  return buf.toString();
}
/**
 * Goes through the buckets from position ix to the end, checking
 * for each existing candidate whether it occurs in the bucket, and
 * if so, increasing its score accordingly. No new candidates are
 * added.
 */
private void bumpScores(Map<Long, Score> candidates,
                        List<Bucket> buckets,
                        int ix) {
  int pos = ix;
  while (pos < buckets.size()) {
    Bucket bucket = buckets.get(pos);
    // a bucket this much bigger than the candidate set is too
    // unselective to be worth scanning; stop entirely
    if (bucket.nextfree > CUTOFF_FACTOR_2 * candidates.size())
      return;
    double bonus = bucket.getScore();
    for (Score candidate : candidates.values()) {
      if (bucket.contains(candidate.id))
        candidate.score += bonus;
    }
    pos++;
  }
}
/**
 * Goes through the first buckets, picking out candidate records and
 * tallying up their scores.
 * @return the index of the first bucket we did not process
 */
private int collectCandidates(Map<Long, Score> candidates,
                              List<Bucket> buckets,
                              int threshold) {
  // hard cap on how many candidates we are willing to gather
  int limit = CUTOFF_FACTOR_1 * max_search_hits;
  int pos = 0;
  while (pos < threshold && candidates.size() < limit) {
    Bucket bucket = buckets.get(pos);
    long[] members = bucket.records;
    double bonus = bucket.getScore();
    for (int mx = 0; mx < bucket.nextfree; mx++) {
      Score tally = candidates.get(members[mx]);
      if (tally == null) {
        tally = new Score(members[mx]);
        candidates.put(members[mx], tally);
      }
      tally.score += bonus;
    }
    if (DEBUG)
      System.out.println("Bucket " + bucket.nextfree + " -> " + candidates.size());
    pos++;
  }
  return pos;
}
/**
 * Tokenizes all lookup fields of the record and returns the index
 * bucket for every token that is present in the index.
 */
private List<Bucket> lookup(Record record) {
  // FIX: parameterized the raw ArrayList, and removed the unused
  // local "long[] ids = b.records" of the old code
  List<Bucket> buckets = new ArrayList<Bucket>();
  for (Property p : config.getLookupProperties()) {
    String propname = p.getName();
    Collection<String> values = record.getValues(propname);
    if (values == null)
      continue; // record has no values for this property
    for (String value : values) {
      String[] tokens = StringUtils.split(value);
      for (int ix = 0; ix < tokens.length; ix++) {
        Bucket b = store.lookupToken(propname, tokens[ix]);
        // tokens missing from the index produce no (usable) bucket
        if (b == null || b.records == null)
          continue;
        if (DEBUG)
          System.out.println(propname + ", " + tokens[ix] + ": " + b.nextfree + " (" + b.getScore() + ")");
        buckets.add(b);
      }
    }
  }
  return buckets;
}
// public so that we can test the priority queue
/**
 * A candidate record (by internal ID) together with its accumulated
 * score. The natural ordering is DESCENDING by score (higher scores
 * sort first), and is not consistent with equals, since equals is
 * not overridden.
 */
public static class Score implements Comparable<Score> {
  public long id;
  public double score;

  public Score(long id) {
    this.id = id;
  }

  /**
   * Orders scores descending by score value.
   */
  public int compareTo(Score other) {
    // FIX: Double.compare produces the same ordering as the previous
    // hand-rolled <"/">-comparison for ordinary values, but is also
    // well-defined (a total order) when NaN is involved
    return Double.compare(other.score, score);
  }
}
// public so that we can test it
/**
 * A max-heap over an array of Score objects (keyed on .score), used
 * to pull out the highest-scoring candidates without fully sorting
 * the array. Takes ownership of the array passed to the constructor.
 */
public static class PriorityQueue {
  private Score[] heap;
  private int size;

  public PriorityQueue(Score[] scores) {
    this.heap = scores;
    this.size = scores.length; // heap is always full to begin with
    // establish the heap property bottom-up over the whole array
    for (int node = size / 2; node >= 0; node--)
      siftDown(node);
  }

  /**
   * Assuming the subtrees below node already satisfy the max-heap
   * property, moves heap[node] down until the subtree rooted at
   * node is correctly heaped. Iterative equivalent of the classic
   * recursive heapify.
   */
  private void siftDown(int node) {
    int current = node;
    while (true) {
      int left = current * 2 + 1;
      if (left >= size)
        return; // current is a leaf; nothing to do
      int best = current;
      if (heap[left].score > heap[best].score)
        best = left;
      int right = left + 1;
      if (right < size && heap[right].score > heap[best].score)
        best = right;
      if (best == current)
        return; // heap property already holds here
      Score tmp = heap[best];
      heap[best] = heap[current];
      heap[current] = tmp;
      current = best;
    }
  }

  /**
   * Removes and returns the highest-scoring element.
   */
  public Score next() {
    Score top = heap[0];
    size--;
    if (size >= 0) {
      heap[0] = heap[size];
      heap[size] = null; // allow the Score to be GC-ed
      siftDown(0);
    }
    return top;
  }
}
}