package gr.iti.mklab.visual.datastructures;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.util.Date;
import java.util.List;
import com.aliasi.util.BoundedPriorityQueue;
import com.javadocmd.simplelatlng.LatLng;
import com.sleepycat.bind.tuple.IntegerBinding;
import com.sleepycat.bind.tuple.StringBinding;
import com.sleepycat.bind.tuple.TupleBinding;
import com.sleepycat.bind.tuple.TupleInput;
import com.sleepycat.bind.tuple.TupleOutput;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
import com.sleepycat.je.EnvironmentNotFoundException;
import com.sleepycat.je.ForwardCursor;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.persist.EntityStore;
import com.sleepycat.persist.PrimaryIndex;
import com.sleepycat.persist.StoreConfig;
import gr.iti.mklab.visual.utilities.Answer;
import gr.iti.mklab.visual.utilities.AnswerWithGeolocation;
import gr.iti.mklab.visual.utilities.MetaDataEntity;
import gr.iti.mklab.visual.utilities.Result;
/**
* This class abstracts operations related to persistence and id lookup from the actual indexing structures.
* The term id is used for the name or other identifier of the vectors being indexed while the term iid
* (internal id) is used for the id assigned to a vector internally by each indexing structure. <br>
* An id is of type String and is kept on disk while iid is of type int and is loaded in memory.<br>
* Berkeley DB (BDB) is used for efficient persistent storage.
*
* @author Eleftherios Spyromitros-Xioufis
*/
public abstract class AbstractSearchStructure {
/**
 * The total memory (in bytes) to be used by the BDB cache, 512 MB by default. Larger values will allow
 * faster id lookup.
 */
protected long cacheSize = 1024 * 1024 * 512;
/**
 * Whether the environment will be transactional. If true, ensures that the dbs will not be corrupted.
 * <br>
 * For more information on what this means, refer to the BDB documentation.
 */
protected boolean transactional = false;
/**
 * The length of the raw vectors being indexed.
 */
protected int vectorLength;
/**
 * Keeps track of the total number of indexed vectors, acts as an auto-increment primary key field: its
 * current value is the internal id given to the next vector that gets indexed.
 */
protected int loadCounter;
/**
 * Whether the index will be loaded in memory. We can avoid loading the index in memory when we only want
 * to perform indexing.
 */
protected boolean loadIndexInMemory;
/**
 * The maximum number of vectors that can be indexed.
 */
protected final int maxNumVectors;
/**
 * Whether to initialize the load counter by counting the size of the {@link #iidToIdDB}. This operation
 * incurs a large cost when loading very large indices. It can be set to false for efficiency reasons. In
 * that case, the load counter should be set manually.
 */
private boolean countSizeOnLoad;
/**
 * Whether the index should open only for read access. This allows multiple opens of the index.
 */
protected boolean readOnly;
/**
 * The database environment. Access to this field is needed by specific indexing structures that implement
 * persistence.
 */
protected Environment dbEnv;
/**
 * BDB store holding id to iid mappings, required during indexing to quickly check if a name is already
 * indexed.
 */
protected Database idToIidDB;
/**
 * BDB store holding iid to id mappings, required for name look-up during nn search.
 */
protected Database iidToIdDB;
/**
 * BDB store holding iid to latitude-longitude mappings, required for geolocation look-up during nn
 * search. Only opened when {@link #useGeolocation} is true.
 */
protected Database iidToGeolocationDB;
/**
 * BDB store holding iid to metadata mappings, required for metadata look-up during nn search. Only
 * opened when {@link #useMetaData} is true.
 */
protected EntityStore iidToMetadataDB;
/**
 * Total time (in ms) spent in internal vector indexing operations. It is accumulated per indexed vector
 * and divided by the load counter when the indexing times are reported.
 */
private long totalInternalVectorIndexingTime;
/**
 * Total time (in ms) spent creating the id to iid and the reverse mappings. It is accumulated per indexed
 * vector and divided by the load counter when the indexing times are reported.
 */
private long totalIdMappingTime;
/**
 * Total time (in ms) spent indexing vectors (mapping plus internal indexing). It is accumulated per
 * indexed vector and divided by the load counter when the indexing times are reported.
 */
private long totalVectorIndexingTime;
/**
 * Whether to create/load geolocation db. Currently hard-coded to false.
 */
protected final boolean useGeolocation = false;
/**
 * Whether to create/load metadata db. Currently hard-coded to false.
 */
protected final boolean useMetaData = false;
/**
 * Constructor. Used when we count the size of the database when opening it. Delegates to the
 * six-argument constructor with countSizeOnLoad set to true, an initial load counter of 0 and
 * loadIndexInMemory set to true.
 *
 * @param vectorLength
 *            The dimensionality of the vectors being indexed
 * @param maxNumVectors
 *            The maximum allowable size (number of vectors) of the index
 * @param readOnly
 *            If true the persistent store will be opened only for read access (allows multiple opens)
 */
protected AbstractSearchStructure(int vectorLength, int maxNumVectors, boolean readOnly) {
this(vectorLength, maxNumVectors, readOnly, true, 0, true);
}
/**
 * Constructor that exposes every option except the BDB cache size, which keeps its default value. Use it
 * to skip counting the database size on load and to supply a preset value for the load counter instead.
 *
 * @param vectorLength
 *            The dimensionality of the vectors being indexed
 * @param maxNumVectors
 *            The maximum allowable size (number of vectors) of the index
 * @param readOnly
 *            If true the persistent store will be opened only for read access (allows multiple opens)
 * @param countSizeOnLoad
 *            Whether the load counter will be initialized by the size of the persistent store
 * @param loadCounter
 *            The initial value of the load counter
 * @param loadIndexInMemory
 *            Whether to load the index in memory, we can avoid loading the index in memory when we only
 *            want to perform indexing
 */
protected AbstractSearchStructure(int vectorLength, int maxNumVectors, boolean readOnly,
        boolean countSizeOnLoad, int loadCounter, boolean loadIndexInMemory) {
    // capacity and dimensionality of the index
    this.maxNumVectors = maxNumVectors;
    this.vectorLength = vectorLength;
    // how the persistent store is opened and how the counter is initialized
    this.readOnly = readOnly;
    this.countSizeOnLoad = countSizeOnLoad;
    this.loadCounter = loadCounter;
    // whether queries will be possible on this instance
    this.loadIndexInMemory = loadIndexInMemory;
}
/**
* Constructor. Used when we want to avoid counting the database size and to use a preset value for the
* load counter.
*
* @param vectorLength
* The dimensionality of the VLAD vectors being indexed
* @param maxNumVectors
* The maximum allowable size (number of vectors) of the index
* @param readOnly
* If true the persistent store will opened only for read access (allows multiple opens)
* @param countSizeOnLoad
* Whether the load counter will be initialized by the size of the persistent store
* @param loadCounter
* The initial value of the load counter
* @param loadIndexInMemory
* Whether to load the index in memory, we can avoid loading the index in memory when we only
* want to perform indexing
* @param cacheSize
* The size of the cache in Megabytes
*/
protected AbstractSearchStructure(int vectorLength, int maxNumVectors, boolean readOnly,
boolean countSizeOnLoad, int loadCounter, boolean loadIndexInMemory, long cachesize) {
this.vectorLength = vectorLength;
this.loadCounter = loadCounter;
this.maxNumVectors = maxNumVectors;
this.readOnly = readOnly;
this.countSizeOnLoad = countSizeOnLoad;
this.loadIndexInMemory = loadIndexInMemory;
this.cacheSize = cachesize * 1024 * 1024;
}
/**
 * Adds the given vector to the index under the given id. The method is synchronized, so concurrent
 * callers are served one at a time; this keeps the persistent BDB store consistent when several threads
 * index vectors.
 * <br>
 * Nothing is indexed (and false is returned) when the index is already full or when the id is already
 * present in the index.
 *
 * @param id
 *            The id of the vector
 * @param vector
 *            The vector
 * @return True if the vector is successfully indexed, false otherwise.
 * @throws Exception
 *             If the structure specific indexing fails
 */
public synchronized boolean indexVector(String id, double[] vector) throws Exception {
    final long overallStart = System.currentTimeMillis();

    final boolean capacityLeft = loadCounter < maxNumVectors;
    if (!capacityLeft) {
        System.out.println("Maximum index capacity reached, no more vectors can be indexed!");
        return false;
    }
    final boolean duplicate = isIndexed(id);
    if (duplicate) {
        System.out.println("Vector '" + id + "' already indexed!");
        return false;
    }

    // step 1: persist the id <-> iid mappings
    final long mappingStart = System.currentTimeMillis();
    createMapping(id);
    final long mappingMillis = System.currentTimeMillis() - mappingStart;
    totalIdMappingTime = totalIdMappingTime + mappingMillis;

    // step 2: let the concrete structure index the vector
    final long internalStart = System.currentTimeMillis();
    indexVectorInternal(vector);
    final long internalMillis = System.currentTimeMillis() - internalStart;
    totalInternalVectorIndexingTime = totalInternalVectorIndexingTime + internalMillis;

    // step 3: the vector is in, move on to the next internal id
    loadCounter += 1;
    final boolean reportProgress = (loadCounter % 100) == 0;
    if (reportProgress) {
        System.out.println(new Date() + " # indexed vectors: " + loadCounter);
    }

    final long overallMillis = System.currentTimeMillis() - overallStart;
    totalVectorIndexingTime = totalVectorIndexingTime + overallMillis;
    return true;
}
/**
 * This method should be implemented in all subclasses and do the operations required for indexing the
 * given vector. It is called by {@link #indexVector(String, double[])} after the id mappings have been
 * persisted and before the load counter is increased.
 *
 * @param vector
 *            The vector to be indexed
 * @throws Exception
 *             If the vector cannot be indexed
 */
protected abstract void indexVectorInternal(double[] vector) throws Exception;
/**
 * Answers a nearest neighbor query for the given vector. The search itself is delegated to
 * {@link #computeNearestNeighborsInternal(int, double[])}; the internal ids it returns are then
 * translated to external ids. The returned {@link Answer} also carries the time (in nanoseconds) spent
 * in the search and in the lookup.
 *
 * @param k
 *            The number of nearest neighbors to return
 * @param queryVector
 *            The query vector
 * @return The answer
 * @throws Exception
 *             If the index is not loaded in memory or the search fails
 */
public Answer computeNearestNeighbors(int k, double[] queryVector) throws Exception {
    if (loadIndexInMemory) {
        final long searchStart = System.nanoTime();
        final BoundedPriorityQueue<Result> neighbors = computeNearestNeighborsInternal(k, queryVector);
        final long searchNanos = System.nanoTime() - searchStart;
        return lookUp(neighbors, searchNanos);
    }
    throw new Exception("Cannot execute query because the index is not loaded in memory!");
}
/**
 * This method returns a bounded priority queue of Result objects, which contains the k nearest neighbors
 * along with their iids and distances from the query vector, ordered by lowest distance. Subclasses
 * should implement this method. It is called by {@link #computeNearestNeighbors(int, double[])}, which
 * afterwards replaces the iids with external ids.
 *
 * @param k
 *            The number of nearest neighbors to return
 * @param queryVector
 *            The query vector
 * @return A bounded priority queue of Result objects
 * @throws Exception
 *             If the search cannot be performed
 */
protected abstract BoundedPriorityQueue<Result> computeNearestNeighborsInternal(int k,
double[] queryVector) throws Exception;
/**
 * This method returns an {@link Answer} object, which contains the k nearest neighbors along with their
 * ids and distances from the query vector, ordered by lowest distance. The method resolves the given id
 * to an internal id, calls {@link #computeNearestNeighborsInternal(int, int)} and then performs id
 * lookup.
 *
 * @param k
 *            The number of nearest neighbors to return
 * @param queryId
 *            The id of the query vector, it must already be indexed
 * @return The answer
 * @throws Exception
 *             If the index is not loaded in memory, if the given id is not indexed or if the search fails
 */
public Answer computeNearestNeighbors(int k, String queryId) throws Exception {
    // same precondition as the vector based query
    if (!loadIndexInMemory) {
        throw new Exception("Cannot execute query because the index is not loaded in memory!");
    }
    int internalIdQuery = getInternalId(queryId);
    // getInternalId signals an unknown id with -1, which must not reach the internal search
    if (internalIdQuery < 0) {
        throw new Exception("Cannot execute query because id '" + queryId + "' is not indexed!");
    }
    long start = System.nanoTime();
    BoundedPriorityQueue<Result> nnQueue = computeNearestNeighborsInternal(k, internalIdQuery);
    long indexSearchTime = System.nanoTime() - start;
    return lookUp(nnQueue, indexSearchTime);
}
/**
 * This method returns a bounded priority queue of Result objects, which contains the k nearest neighbors
 * along with their iids and distances from the query vector, ordered by lowest distance. Subclasses
 * should implement this method. It is called by {@link #computeNearestNeighbors(int, String)} with the
 * internal id that the query id maps to.
 *
 * @param k
 *            The number of nearest neighbors to return
 * @param iid
 *            The internal id of the query vector
 * @return A bounded priority queue of Result objects
 * @throws Exception
 *             If the search cannot be performed
 */
protected abstract BoundedPriorityQueue<Result> computeNearestNeighborsInternal(int k, int iid)
throws Exception;
/**
 * Turns the raw search results into an {@link Answer} by attaching the external id of each neighbor. When
 * geolocation is enabled ({@link #useGeolocation}), the geolocation of each neighbor is attached as well
 * and an {@link AnswerWithGeolocation} is returned.
 *
 * @param nnQueue
 *            The neighbors returned by the internal search (internal ids and distances)
 * @param indexSearchTime
 *            The time (in nanoseconds) taken by the internal search, passed on to the answer
 * @return The answer, with lookup times measured in nanoseconds
 */
private Answer lookUp(BoundedPriorityQueue<Result> nnQueue, long indexSearchTime) {
    Result[] nn = nnQueue.toArray(new Result[nnQueue.size()]);
    String[] ids = new String[nn.length];
    double[] distances = new double[nn.length];
    long start = System.nanoTime();
    for (int i = 0; i < nn.length; i++) { // attach external ids to the results
        distances[i] = nn[i].getDistance();
        ids[i] = getId(nn[i].getId());
    }
    long nameLookUpTime = System.nanoTime() - start;
    // the geolocation db is only opened when useGeolocation is set, so that flag (and not useMetaData)
    // must decide whether geolocations are looked up
    if (!useGeolocation) {
        return new Answer(ids, distances, nameLookUpTime, indexSearchTime);
    } else {
        start = System.nanoTime();
        LatLng[] geolocations = new LatLng[nn.length];
        for (int i = 0; i < nn.length; i++) { // attach geolocations to the results
            geolocations[i] = getGeolocation(nn[i].getId());
        }
        long geolocationLookupTime = System.nanoTime() - start;
        return new AnswerWithGeolocation(ids, distances, geolocations, nameLookUpTime, indexSearchTime,
                geolocationLookupTime);
    }
}
/**
 * Looks up the internal id that was assigned to the vector with the given id. Accesses the BDB store!
 *
 * @param id
 *            The id of the vector
 * @return The internal id assigned to this vector or -1 if the id is not found.
 */
public int getInternalId(String id) {
    final DatabaseEntry idEntry = new DatabaseEntry();
    final DatabaseEntry iidEntry = new DatabaseEntry();
    StringBinding.stringToEntry(id, idEntry);
    // a successful read means that the id is present in the id to iid database
    final OperationStatus status = idToIidDB.get(null, idEntry, iidEntry, null);
    return status == OperationStatus.SUCCESS ? IntegerBinding.entryToInt(iidEntry) : -1;
}
/**
 * Returns the id of the vector which was assigned the given internal id or null if the internal id does
 * not exist. Accesses the BDB store!
 * <br>
 * The range check tolerates an internal id equal to the load counter, because during indexing the
 * mapping of a vector is persisted just before the load counter is increased. If no mapping exists for
 * that internal id, it is simply not assigned yet and null is returned.
 *
 * @param iid
 *            The internal id of the vector
 * @return The id mapped to the given internal id or null if the internal id does not exist
 * @throws IllegalStateException
 *             If the internal id belongs to an indexed vector but has no mapping, which indicates a
 *             corrupted index
 */
public String getId(int iid) {
    if (iid < 0 || iid > loadCounter) {
        System.out.println("Internal id " + iid + " is out of range!");
        return null;
    }
    DatabaseEntry key = new DatabaseEntry();
    IntegerBinding.intToEntry(iid, key);
    DatabaseEntry data = new DatabaseEntry();
    if (iidToIdDB.get(null, key, data, null) == OperationStatus.SUCCESS) {
        return StringBinding.entryToString(data);
    }
    if (iid == loadCounter) {
        // the next internal id to be handed out, no vector has it yet
        System.out.println("Internal id " + iid + " is out of range!");
        return null;
    }
    // report the problem to the caller instead of terminating the whole JVM with a success exit code
    throw new IllegalStateException(
            "Internal id " + iid + " is in range but id was not found. Index is probably corrupted");
}
/**
 * Looks up the geolocation stored for the vector with the given internal id. Accesses the BDB store!
 *
 * @param iid
 *            The internal id of the vector
 * @return The geolocation mapped to the given internal id as a {@link LatLng} object, or null if the
 *         internal id is out of range or no geolocation is stored for it
 */
public LatLng getGeolocation(int iid) {
    final boolean outOfRange = iid < 0 || iid > loadCounter;
    if (outOfRange) {
        System.out.println("Internal id " + iid + " is out of range!");
        return null;
    }
    final DatabaseEntry iidEntry = new DatabaseEntry();
    final DatabaseEntry coordinatesEntry = new DatabaseEntry();
    IntegerBinding.intToEntry(iid, iidEntry);
    final OperationStatus status = iidToGeolocationDB.get(null, iidEntry, coordinatesEntry, null);
    if (status != OperationStatus.SUCCESS) {
        System.out.println("Internal id " + iid + " is in range but gelocation was not found.");
        return null;
    }
    // the entry holds two doubles: latitude first, then longitude
    final TupleInput reader = TupleBinding.entryToInput(coordinatesEntry);
    final double latitude = reader.readDouble();
    final double longitude = reader.readDouble();
    return new LatLng(latitude, longitude);
}
/**
 * Looks up the metadata stored for the vector with the given internal id. Accesses the BDB store!
 *
 * @param iid
 *            The internal id of the vector
 * @return The metadata mapped to the given internal id as a {@link MetaDataEntity} object, or null if
 *         the internal id is out of range or no metadata is stored for it
 * @throws Exception
 *             If the metadata store cannot be read
 */
public MetaDataEntity getMetadata(int iid) throws Exception {
    final boolean inRange = iid >= 0 && iid <= loadCounter;
    if (inRange) {
        final PrimaryIndex<Integer, MetaDataEntity> metadataByIid = iidToMetadataDB
                .getPrimaryIndex(Integer.class, MetaDataEntity.class);
        return metadataByIid.get(null, iid, null);
    }
    System.out.println("Internal id " + iid + " is out of range!");
    return null;
}
/**
 * Stores the geolocation of a previously indexed vector, replacing any geolocation that was stored for
 * it before. The geolocation is written as two doubles: the latitude followed by the longitude.
 *
 * @param iid
 *            The internal id of the vector
 * @param latitude
 *            The latitude to store
 * @param longitude
 *            The longitude to store
 * @return true if geolocation is successfully set, false otherwise
 */
public boolean setGeolocation(int iid, double latitude, double longitude) {
    final boolean outOfRange = iid < 0 || iid > loadCounter;
    if (outOfRange) {
        System.out.println("Internal id " + iid + " is out of range!");
        return false;
    }
    final DatabaseEntry iidEntry = new DatabaseEntry();
    final DatabaseEntry coordinatesEntry = new DatabaseEntry();
    IntegerBinding.intToEntry(iid, iidEntry);
    // serialize the coordinates, latitude first
    final TupleOutput writer = new TupleOutput();
    writer.writeDouble(latitude);
    writer.writeDouble(longitude);
    TupleBinding.outputToEntry(writer, coordinatesEntry);
    final OperationStatus status = iidToGeolocationDB.put(null, iidEntry, coordinatesEntry);
    return status == OperationStatus.SUCCESS;
}
/**
 * This method is used to set the metadata of a previously indexed vector. If the metadata is already set,
 * this method replaces it, otherwise a new entry is created.
 *
 * @param iid
 *            The internal id of the vector
 * @param metaData
 *            A java object of any class with the @persistent annotation
 * @return true if metadata is successfully set, false if the internal id is out of range
 */
public boolean setMetadata(int iid, Object metaData) {
    if (iid < 0 || iid > loadCounter) {
        System.out.println("Internal id " + iid + " is out of range!");
        return false;
    }
    MetaDataEntity mde = new MetaDataEntity(iid, metaData);
    PrimaryIndex<Integer, MetaDataEntity> primaryIndex = iidToMetadataDB.getPrimaryIndex(Integer.class,
            MetaDataEntity.class);
    // put inserts the entity or replaces an existing one with the same key; writing only when an entry
    // already exists would make it impossible to store metadata for the first time
    primaryIndex.put(null, mde);
    return true;
}
/**
 * <b>{@link #getInternalId(String)} can always be called instead of this method at the same cost!</b>
 * <br>
 * Tells whether a vector with the given id is already indexed, which is useful to avoid re-indexing the
 * same vector. Only the id to iid database is consulted: by convention, an id found there is assumed to
 * be present in all other structures too, and the remaining checks are skipped for efficiency. Accesses
 * the BDB store!
 *
 * @param id
 *            The id of the vector
 * @return true if the vector is indexed, false otherwise
 */
public boolean isIndexed(String id) {
    final DatabaseEntry idEntry = new DatabaseEntry();
    final DatabaseEntry iidEntry = new DatabaseEntry();
    StringBinding.stringToEntry(id, idEntry);
    final OperationStatus status = idToIidDB.get(null, idEntry, iidEntry, null);
    return status == OperationStatus.SUCCESS;
}
/**
 * Persists the mapping between the given id and a new internal id, in both directions. The internal id
 * is the current value of {@link #loadCounter}. Should be called every time that a new vector is indexed.
 *
 * @param id
 *            The id
 */
protected void createMapping(String id) {
    final DatabaseEntry iidEntry = new DatabaseEntry();
    IntegerBinding.intToEntry(loadCounter, iidEntry);
    final DatabaseEntry idEntry = new DatabaseEntry();
    StringBinding.stringToEntry(id, idEntry);
    // iid -> id, used for name look-up when answering queries
    iidToIdDB.put(null, iidEntry, idEntry);
    // id -> iid, used for the duplicate check when indexing
    idToIidDB.put(null, idEntry, iidEntry);
}
/**
 * Creates and/or opens the BDB databases of this structure inside the already opened environment: the
 * two mapping databases and, depending on the respective flags, the geolocation database and the
 * metadata store. When {@link #countSizeOnLoad} is set, the load counter is initialized from the number
 * of stored iid to id mappings (capped at {@link #maxNumVectors}).
 *
 * @throws Exception
 *             If a database cannot be created or opened
 */
private void createOrOpenBDBDbs() throws Exception {
    // one configuration shared by all plain databases
    final DatabaseConfig mappingConfig = new DatabaseConfig();
    mappingConfig.setAllowCreate(true); // a missing db is created
    mappingConfig.setReadOnly(readOnly);
    mappingConfig.setTransactional(transactional);

    iidToIdDB = dbEnv.openDatabase(null, "idToName", mappingConfig);
    if (countSizeOnLoad) {
        // the number of stored mappings tells how many vectors are already indexed
        System.out.println(new Date() + " counting index size started ");
        final long storedMappings = iidToIdDB.count();
        loadCounter = Math.min((int) storedMappings, maxNumVectors);
        System.out.println(new Date() + " counting index size ended ");
        System.out.println("Index size: " + loadCounter);
    }
    idToIidDB = dbEnv.openDatabase(null, "nameToId", mappingConfig);

    if (useGeolocation) {
        iidToGeolocationDB = dbEnv.openDatabase(null, "idToGeolocation", mappingConfig);
    }
    if (useMetaData) {
        // the metadata lives in an entity store, which has its own kind of configuration
        final StoreConfig metadataConfig = new StoreConfig();
        metadataConfig.setAllowCreate(true); // a missing store is created
        metadataConfig.setReadOnly(readOnly);
        metadataConfig.setTransactional(transactional);
        iidToMetadataDB = new EntityStore(dbEnv, "idToMetadata", metadataConfig);
    }
}
/**
 * This is a utility method that can be used to dump the contents of the iidToIdDB to a txt file. Each
 * line of the file holds an internal id followed by a space and the id it maps to.
 *
 * @param dumpFilename
 *            Full path to the file where the dump will be written.
 * @throws Exception
 *             If the database cannot be read or the file cannot be written
 */
public void dumpiidToIdDB(String dumpFilename) throws Exception {
    DatabaseEntry foundKey = new DatabaseEntry();
    DatabaseEntry foundData = new DatabaseEntry();
    ForwardCursor cursor = iidToIdDB.openCursor(null, null);
    try {
        BufferedWriter out = new BufferedWriter(new FileWriter(new File(dumpFilename)));
        try {
            while (cursor.getNext(foundKey, foundData, LockMode.DEFAULT) == OperationStatus.SUCCESS) {
                int iid = IntegerBinding.entryToInt(foundKey);
                String id = StringBinding.entryToString(foundData);
                out.write(iid + " " + id + "\n");
            }
        } finally {
            out.close(); // also flushes what has been written so far
        }
    } finally {
        cursor.close(); // the cursor must be released even if reading or writing failed
    }
}
/**
 * This is a utility method that can be used to dump the contents of the idToIidDB to a txt file. Each
 * line of the file holds an id followed by a space and the internal id it maps to.
 *
 * @param dumpFilename
 *            Full path to the file where the dump will be written.
 * @throws Exception
 *             If the database cannot be read or the file cannot be written
 */
public void dumpidToIidDB(String dumpFilename) throws Exception {
    DatabaseEntry foundKey = new DatabaseEntry();
    DatabaseEntry foundData = new DatabaseEntry();
    ForwardCursor cursor = idToIidDB.openCursor(null, null);
    try {
        BufferedWriter out = new BufferedWriter(new FileWriter(new File(dumpFilename)));
        try {
            while (cursor.getNext(foundKey, foundData, LockMode.DEFAULT) == OperationStatus.SUCCESS) {
                int iid = IntegerBinding.entryToInt(foundData);
                String id = StringBinding.entryToString(foundKey);
                out.write(id + " " + iid + "\n");
            }
        } finally {
            out.close(); // also flushes what has been written so far
        }
    } finally {
        cursor.close(); // the cursor must be released even if reading or writing failed
    }
}
/**
 * This method creates and/or opens the BDB environment in the supplied directory. The directory is
 * created (together with any missing parent directories) if it does not exist. An existing environment is
 * opened if one is found in the directory, otherwise a new one is created. <br>
 * TODO: The configuration can be tuned for being more efficient / less persistent!
 *
 * @param BDBEnvHome
 *            The directory where the BDB environment will be created.
 * @throws Exception
 *             If the directory cannot be created or the environment cannot be opened
 */
private void createOrOpenBDBEnv(String BDBEnvHome) throws Exception {
    // create the BDBEnvHome directory if it does not exist
    File BDBEnvHomeDir = new File(BDBEnvHome);
    if (BDBEnvHomeDir.isDirectory()) {
        System.out.println(BDBEnvHome + " directory exists.");
    } else if (BDBEnvHomeDir.mkdirs()) {
        System.out.println(BDBEnvHome + " directory created.");
    } else if (!BDBEnvHomeDir.isDirectory()) {
        // fail here with a clear message instead of letting the environment creation fail later
        throw new java.io.IOException("Could not create the BDB environment directory: " + BDBEnvHome);
    }
    // configuration of the bdb environment, applies to all dbs in this environment
    EnvironmentConfig envConf = new EnvironmentConfig();
    envConf.setAllowCreate(false); // initially we do not allow create
    envConf.setReadOnly(readOnly);
    envConf.setTransactional(transactional);
    envConf.setCacheSize(cacheSize);
    // Instantiate the Environment. This opens it and also possibly creates it.
    try {
        dbEnv = new Environment(BDBEnvHomeDir, envConf);
        System.out.println("An existing BDB environment was found.");
    } catch (EnvironmentNotFoundException e) {
        // nothing to open in this directory, so retry with creation allowed
        envConf.setAllowCreate(true);
        dbEnv = new Environment(BDBEnvHomeDir, envConf);
        System.out.println("A new BDB environment was created.");
    }
    // printing information about the BDB environment
    System.out.println("== BDB environment configuration ===");
    System.out.println(dbEnv.getConfig());
    System.out.println("== BDB environment database names ===");
    List<String> dbNames = dbEnv.getDatabaseNames();
    for (String dbName : dbNames) {
        System.out.println(dbName);
    }
    System.out.println("");
}
/**
 * Prepares the persistent storage of this structure: first the BDB environment in the given directory is
 * created or opened, then the databases inside it.
 *
 * @param BDBEnvHome
 *            The directory where the BDB environment will be created
 * @throws Exception
 *             If the environment or one of the databases cannot be created or opened
 */
protected void createOrOpenBDBEnvAndDbs(String BDBEnvHome) throws Exception {
    // the environment must be open before any database can be opened in it
    this.createOrOpenBDBEnv(BDBEnvHome);
    this.createOrOpenBDBDbs();
}
/**
 * Returns the current value of the loadCounter.
 *
 * @return The current value of the load counter
 */
public int getLoadCounter() {
    return this.loadCounter;
}
/**
 * Prints the indexing time measurements to the standard output. Each accumulated time is divided by the
 * current value of the load counter. The structure specific measurements are printed last.
 */
public void outputIndexingTimes() {
    final double internalAverage = (double) totalInternalVectorIndexingTime / loadCounter;
    System.out.println(internalAverage + " ms => internal indexing time");
    final double mappingAverage = (double) totalIdMappingTime / loadCounter;
    System.out.println(mappingAverage + " ms => id mapping time");
    final double overallAverage = (double) totalVectorIndexingTime / loadCounter;
    System.out.println(overallAverage + " ms => total indexing time");
    outputIndexingTimesInternal();
}
/**
 * Should output index specific time measurements. It is called by {@link #outputIndexingTimes()} after
 * the general indexing time measurements have been printed.
 */
protected abstract void outputIndexingTimesInternal();
/**
 * This method closes the open BDB environment and databases. The databases of this class are closed
 * first, then the ones of the subclass (via {@link #closeInternal()}) and finally the environment.
 * Databases that were never opened are skipped, so the method can also be used after a failed or partial
 * initialization.
 */
public void close() {
    if (dbEnv == null) {
        System.out.println("BDB environment is null!");
        return;
    }
    // closing dbs, a handle is null when its db was not opened
    if (iidToIdDB != null) {
        iidToIdDB.close();
    }
    if (idToIidDB != null) {
        idToIidDB.close();
    }
    if (iidToGeolocationDB != null) {
        iidToGeolocationDB.close();
    }
    if (iidToMetadataDB != null) {
        iidToMetadataDB.close();
    }
    closeInternal();
    dbEnv.close(); // closing env
}
/**
 * Each subclass should implement this method to close the BDB databases that it uses. It is called by
 * {@link #close()} after the databases of this class have been closed and before the environment is
 * closed.
 */
protected abstract void closeInternal();
}