package gr.iti.mklab.visual.datastructures;

import gnu.trove.list.array.TDoubleArrayList;
import gr.iti.mklab.visual.utilities.Result;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;

import com.aliasi.util.BoundedPriorityQueue;
import com.sleepycat.bind.tuple.IntegerBinding;
import com.sleepycat.bind.tuple.TupleBinding;
import com.sleepycat.bind.tuple.TupleInput;
import com.sleepycat.bind.tuple.TupleOutput;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DiskOrderedCursorConfig;
import com.sleepycat.je.ForwardCursor;
import com.sleepycat.je.OperationStatus;

/**
 * This class is used for indexing vectors and performing k-nearest neighbor queries with exhaustive linear
 * search.
 * 
 * @author Eleftherios Spyromitros-Xioufis
 * 
 */
public class Linear extends AbstractSearchStructure {

	/**
	 * The vectors are stored in this field. Note that we use a single TDoubleArrayList for all vectors: the
	 * vector with internal id i occupies positions [i * vectorLength, (i + 1) * vectorLength). The field is
	 * only initialized when the index is loaded in memory.
	 */
	private TDoubleArrayList vectorsList;

	/**
	 * Whether to use a disk ordered cursor or not. This setting changes how fast the index will be loaded in
	 * main memory.
	 */
	public final boolean useDiskOrderedCursor = false;

	/**
	 * BDB store for persistent storage of the linear index. Keys are internal ids, values are the vector
	 * components written as consecutive doubles.
	 */
	private Database iidToVectorDB;

	/**
	 * Advanced constructor.
	 * 
	 * @param vectorLength
	 *            The dimensionality of the VLAD vectors being indexed
	 * @param maxNumVectors
	 *            The maximum allowable size (number of vectors) of the index
	 * @param readOnly
	 *            If true the persistent store will be opened only for read access (allows multiple opens)
	 * @param BDBEnvHome
	 *            The BDB environment home directory
	 * @param loadIndexInMemory
	 *            Whether to load the index in memory, we can avoid loading the index in memory when we only
	 *            want to perform indexing
	 * @param countSizeOnLoad
	 *            Whether the load counter will be initialized by the size of the persistent store
	 * @param loadCounter
	 *            The initial value of the load counter
	 * @throws Exception
	 *             If the environment or the database cannot be opened, or if loading the index fails
	 */
	public Linear(int vectorLength, int maxNumVectors, boolean readOnly, String BDBEnvHome,
			boolean loadIndexInMemory, boolean countSizeOnLoad, int loadCounter) throws Exception {
		super(vectorLength, maxNumVectors, readOnly, countSizeOnLoad, loadCounter, loadIndexInMemory);
		createOrOpenBDBEnvAndDbs(BDBEnvHome);

		// configuration of the persistent index
		DatabaseConfig dbConf = new DatabaseConfig();
		dbConf.setReadOnly(readOnly);
		dbConf.setTransactional(transactional);
		dbConf.setAllowCreate(true); // db will be created if it does not exist
		iidToVectorDB = dbEnv.openDatabase(null, "vlad", dbConf); // create/open the db using config

		if (loadIndexInMemory) {// load the existing persistent index in memory
			// create the memory objects with the appropriate initial size
			vectorsList = new TDoubleArrayList(maxNumVectors * vectorLength);
			loadIndexInMemory();
		}
	}

	/**
	 * Simple constructor. Delegates to the advanced constructor with the index loaded in memory, the load
	 * counter initialized from the size of the persistent store and an initial load counter of 0.
	 * 
	 * @param vectorLength
	 *            The dimensionality of the VLAD vectors being indexed
	 * @param maxNumVectors
	 *            The maximum allowable size (number of vectors) of the index
	 * @param readOnly
	 *            If true the persistent store will be opened only for read access (allows multiple opens)
	 * @param BDBEnvHome
	 *            The BDB environment home directory
	 * @throws Exception
	 *             If the environment or the database cannot be opened, or if loading the index fails
	 */
	public Linear(int vectorLength, int maxNumVectors, boolean readOnly, String BDBEnvHome) throws Exception {
		this(vectorLength, maxNumVectors, readOnly, BDBEnvHome, true, true, 0);
	}

	/**
	 * Append the vectors array with the given vector. The iid of this vector will be equal to the current
	 * value of the loadCounter.
	 * 
	 * @param vector
	 *            The vector to be indexed
	 * @throws Exception
	 *             If the vector's dimensionality is different from vectorLength
	 */
	protected void indexVectorInternal(double[] vector) throws Exception {
		if (vector.length != vectorLength) {
			throw new Exception("The dimensionality of the vector is wrong!");
		}
		// append the persistent index
		appendPersistentIndex(vector);
		// append the ram-based index
		if (loadIndexInMemory) {
			vectorsList.add(vector);
		}
	}

	/**
	 * Computes the k-nearest neighbors of the given query vector. The search is exhaustive but includes some
	 * optimizations that make it faster, especially for high dimensional vectors: the accumulation of the
	 * squared L2 distance of a candidate is abandoned as soon as it exceeds the current pruning threshold.
	 * 
	 * @param k
	 *            The number of nearest neighbors to be returned
	 * @param queryVector
	 *            The query vector
	 * 
	 * @return A bounded priority queue of Result objects, which contains the k nearest neighbors along with
	 *         their iids and distances from the query vector, ordered by lowest distance.
	 * @throws Exception
	 *             If the index is not loaded in memory
	 * 
	 */
	protected BoundedPriorityQueue<Result> computeNearestNeighborsInternal(int k, double[] queryVector)
			throws Exception {
		if (vectorsList == null) {
			// without this check the documented failure would surface as a NullPointerException
			throw new Exception("Cannot execute query because the index is not loaded in memory!");
		}
		BoundedPriorityQueue<Result> nn = new BoundedPriorityQueue<Result>(new Result(), k);

		double lowest = Double.MAX_VALUE;
		int numVectors = vectorsList.size() / vectorLength;
		for (int i = 0; i < numVectors; i++) {
			boolean skip = false;
			int startIndex = i * vectorLength;
			double l2distance = 0;
			for (int j = 0; j < vectorLength; j++) {
				double diff = queryVector[j] - vectorsList.getQuick(startIndex + j);
				l2distance += diff * diff;
				if (l2distance > lowest) {
					// partial distance already exceeds the threshold, no need to finish this candidate
					skip = true;
					break;
				}
			}
			if (!skip) {
				nn.offer(new Result(i, l2distance));
				if (i >= k) {
					// the threshold is only tightened after more than k candidates have been offered;
					// nn.last() is taken as the current pruning bound
					lowest = nn.last().getDistance();
				}
			}
		}
		return nn;
	}

	/**
	 * Computes the k-nearest neighbors of the vector with the given internal id. The search is exhaustive but
	 * includes some optimizations that make it faster, especially for high dimensional vectors.
	 * 
	 * @param k
	 *            The number of nearest neighbors to be returned
	 * @param iid
	 *            The internal id of the query vector
	 * 
	 * @return A bounded priority queue of Result objects, which contains the k nearest neighbors along with
	 *         their iids and distances from the vector with the given internal id, ordered by lowest
	 *         distance.
	 * @throws Exception
	 *             If the index is not loaded in memory or if the internal id is out of range
	 * 
	 */
	protected BoundedPriorityQueue<Result> computeNearestNeighborsInternal(int k, int iid) throws Exception {
		double[] queryVector = getVector(iid); // get the vector with this internal id
		if (queryVector == null) {
			throw new Exception("No vector exists for internal id " + iid + "!");
		}
		return computeNearestNeighborsInternal(k, queryVector);
	}

	/**
	 * Loads the persistent index in memory. At most maxNumVectors vectors are read from the BDB store and
	 * appended to the ram-based index. Progress is printed to standard output.
	 * 
	 * @throws Exception
	 *             If reading from the persistent store fails
	 */
	private void loadIndexInMemory() throws Exception {
		long start = System.currentTimeMillis();
		System.out.println("Loading persistent index in memory.");

		DatabaseEntry foundKey = new DatabaseEntry();
		DatabaseEntry foundData = new DatabaseEntry();

		ForwardCursor cursor = null;
		if (useDiskOrderedCursor) { // disk ordered cursor
			DiskOrderedCursorConfig docc = new DiskOrderedCursorConfig();
			cursor = iidToVectorDB.openCursor(docc);
		} else {
			cursor = iidToVectorDB.openCursor(null, null);
		}

		int counter = 0;
		try {
			// the size limit is tested first so that no record is fetched once the limit is reached
			while (counter < maxNumVectors
					&& cursor.getNext(foundKey, foundData, null) == OperationStatus.SUCCESS) {
				TupleInput input = TupleBinding.entryToInput(foundData);
				double[] vector = new double[vectorLength];
				for (int i = 0; i < vectorLength; i++) {
					vector[i] = input.readDouble();
				}
				// update ram based index
				vectorsList.add(vector);
				counter++;
				if (counter % 1000 == 0) {
					System.out.println(counter + " vectors loaded in memory!");
				}
			}
		} finally {
			// release the cursor even if reading a record fails
			cursor.close();
		}
		long end = System.currentTimeMillis();
		System.out.println(counter + " vectors loaded in " + (end - start) + " ms!");
	}

	/**
	 * Appends the persistent index with the given vector. The vector is stored under the current value of
	 * the loadCounter.
	 * 
	 * @param vector
	 *            The vector
	 */
	private void appendPersistentIndex(double[] vector) {
		TupleOutput output = new TupleOutput();
		for (int i = 0; i < vectorLength; i++) {
			output.writeDouble(vector[i]);
		}
		DatabaseEntry data = new DatabaseEntry();
		TupleBinding.outputToEntry(output, data);
		DatabaseEntry key = new DatabaseEntry();
		IntegerBinding.intToEntry(loadCounter, key);
		iidToVectorDB.put(null, key, data);
	}

	/**
	 * Returns the vector which was assigned the given internal id or null if the internal id does not exist.
	 * The vector is taken either from the ram-based (if loadIndexInMemory is true) or from the disk-based
	 * index.
	 * 
	 * @param iid
	 *            The internal id of the vector
	 * @return The vector with the given internal id or null if the internal id does not exist
	 * @throws IllegalStateException
	 *             If the internal id is in range but no vector is stored for it in the disk-based index
	 */
	public double[] getVector(int iid) {
		// valid internal ids are 0..loadCounter-1, loadCounter itself is the id of the next vector
		if (iid < 0 || iid >= loadCounter) {
			System.out.println("Internal id " + iid + " is out of range!");
			return null;
		}
		double[] vector = new double[vectorLength];
		if (loadIndexInMemory) {
			int startIndex = iid * vectorLength;
			for (int i = 0; i < vectorLength; i++) {
				vector[i] = vectorsList.getQuick(startIndex + i);
			}
		} else {
			// get the vector from the BDB structure
			DatabaseEntry key = new DatabaseEntry();
			IntegerBinding.intToEntry(iid, key);
			DatabaseEntry foundData = new DatabaseEntry();
			if (iidToVectorDB.get(null, key, foundData, null) == OperationStatus.SUCCESS) {
				TupleInput input = TupleBinding.entryToInput(foundData);
				for (int i = 0; i < vectorLength; i++) {
					vector[i] = input.readDouble();
				}
			} else {
				// report the inconsistency to the caller instead of terminating the whole JVM
				throw new IllegalStateException("Internal id " + iid
						+ " is in range but vector was not found. Index is probably corrupted");
			}
		}
		return vector;
	}

	/**
	 * Closes the BDB database that backs the linear index.
	 */
	@Override
	protected void closeInternal() {
		iidToVectorDB.close();
	}

	/**
	 * Intentionally empty: this structure reports no indexing times of its own.
	 */
	@Override
	protected void outputIndexingTimesInternal() {
	}

	/**
	 * Writes all vectors in a csv formatted file. The id goes first, followed by the vector. One line is
	 * written per indexed vector.
	 * 
	 * @param fileName
	 *            Full path to the file
	 * @throws Exception
	 *             If the file cannot be written
	 */
	public void toCSV(String fileName) throws Exception {
		BufferedWriter out = new BufferedWriter(new FileWriter(new File(fileName)));
		try {
			for (int i = 0; i < loadCounter; i++) {
				String identifier = getId(i);
				double[] vector = getVector(i);
				out.write(identifier);
				for (int k = 0; k < vector.length; k++) {
					out.write("," + vector[k]);
				}
				out.write("\n");
			}
		} finally {
			// close() flushes the buffer and releases the file handle even when writing fails
			out.close();
		}
	}
}