/* XXL: The eXtensible and fleXible Library for data processing Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger Head of the Database Research Group Department of Mathematics and Computer Science University of Marburg Germany This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; If not, see <http://www.gnu.org/licenses/>. http://code.google.com/p/xxl/ */ package xxl.core.spatial.cursors; import java.util.Iterator; import xxl.core.collections.bags.ArrayBag; import xxl.core.collections.bags.LIFOBag; import xxl.core.collections.sweepAreas.BagSAImplementor; import xxl.core.collections.sweepAreas.ImplementorBasedSweepArea; import xxl.core.comparators.ComparableComparator; import xxl.core.cursors.Cursor; import xxl.core.cursors.joins.SortMergeJoin; import xxl.core.cursors.wrappers.IteratorCursor; import xxl.core.functions.Function; import xxl.core.functions.Tuplify; import xxl.core.predicates.Predicate; import xxl.core.spatial.KPEzCode; import xxl.core.spatial.predicates.OverlapsPredicate; import xxl.core.util.BitSet; /** * The spatial join algorithm based on space-filling curves proposed by Jack * Orenstein. See: [Ore 91] Jack A. Orenstein: An Algorithm for Computing the * Overlay of k-Dimensional Spaces. SSD 1991: 381-400 for a detailed * explanation. See: [DS 01]: Jens-Peter Dittrich, Bernhard Seeger: GESS: a * Scalable Similarity-Join Algorithm for Mining Large Data Sets in High * Dimensional Spaces. ACM SIGKDD-2001. for a review on Orensteins algorithm. * <br> * <br> * Orensteins algorithm is based on a binary recursive partitioning, where the * binary code represents the so-called Z-ordering (z-codes). <br> * <br> * Orensteins algorithm (ORE) assigns each hypercube of the input relations to * disjoint subspaces of the recursive partitioning whose union entirely covers * the hypercube. ORE sorts the two sets of hypercubes derived from the input * relations (including the possible replicates) w.r.t. the lexicographical * ordering of its binary code. After that, the relations are merged using two * main-memory stacks Stack_R and Stack_S. It is guaranteed that for two * adjacent hypercubes in the stack, the prefix property is satisfied for their * associated codes. Only those hypercubes are joined that have the same prefix * code. <br> * <br> * A deficiency of ORE is that the different assignment strategies examined in * [Ore91] cause substantial replication rates. This results in an increase of * the problem space and hence, sorting will be very expensive. Furthermore, ORE * has not addressed the problem of eliminating duplicates in the result set. * <br> * <br> * Note that the method <code>reorganize(final Object * currentStatus)</code> * could actually be implemented with only 1 LOC. For efficiency reasons we use * a somewhat longer version of the method here. <br> * <br> * Use-case: <br> * The main-method of this class contains the complete code to compute a * similarity join of two sets of points using Orensteins algorithm. * * @see xxl.core.cursors.joins.SortMergeJoin * @see xxl.core.spatial.cursors.Mappers * @see xxl.core.spatial.cursors.GESS * @see xxl.core.spatial.cursors.Replicator * */ public class Orenstein extends SortMergeJoin { /** * The sweep area used by the Orenstein algorithm. */ public static class OrensteinSA<T> extends ImplementorBasedSweepArea<T> { /** * internal cursor to the bag (for reasons of efficiency) */ protected LIFOBag<T> bag; /** * Creates a new Orenstein SweepArea * * @param ID * ID of the SweepArea * @param lifoBag * the lifobag for organizing the SweepArea * @param joinPredicate * the predicate of the join */ public OrensteinSA(int ID, LIFOBag<T> lifoBag, Predicate<? super T> joinPredicate) { super(new BagSAImplementor<T>(lifoBag), ID, false, joinPredicate, 2); this.bag = lifoBag; } /** * Creates a new OrensteinSweepArea. Uses an ArrayBag to store elements. * * @param ID * ID of the SweepArea * @param initialCapacity * the initial capacity of the ArrayBag which is used for * organizing the SweepArea * @param joinPredicate * the predicate of the join */ public OrensteinSA(int ID, int initialCapacity, Predicate<? super T> joinPredicate) { this(ID, new ArrayBag<T>(initialCapacity), joinPredicate); } /** * In contrast to the method {@link #expire(Object, int)}, this method removes * all expired elements from a SweepArea without returning them. * The default implementation removes all elements returned by a call to * {@link #expire(Object, int)}.<BR> * In order to perform a more efficient removal, this method should * be overwritten, e.g., by implementing a bulk deletion. * * @param currentStatus The object containing the necessary information * to perform the reorganization step. * @param ID An ID determining from which input this reorganization step * is triggered. * @throws UnsupportedOperationException An UnsupportedOperationException is thrown, if * is method is not supported by this SweepArea. * @throws IllegalStateException Throws an IllegalStateException if * this method is called at an invalid state. */ // fourth version of this method: low level implementation of delete // (fastest version) public void reorganize(final T currentStatus, int ID) throws IllegalStateException { //check nesting-condition for (Cursor<T> cursor = bag.lifoCursor(); cursor.hasNext();) { BitSet top = ((KPEzCode) cursor.next()).getzCode(); BitSet query = ((KPEzCode) currentStatus).getzCode(); if ((query.precision() < top.precision()) || (query.compare(top) != 0)) cursor.remove(); else break; } } /** * This method counts the number of comparisons required for processing * the query. * * @see xxl.core.collections.sweepAreas.SweepArea#query(java.lang.Object, * int) * @param o * The query object. This object is typically probed against * the elements contained in this SweepArea. * @param ID * An ID determining from which input this method is called. * @return All matching elements of this SweepArea are returned as an * iterator. */ public Iterator<T> query(T o, int ID) { comparisons.counter += impl.size(); return super.query(o, ID); } /** * Inserts the given element into this SweepArea. The default implementation * simply forwards this call to the underlying implementor. Thus, * it calls <code>impl.insert(o)</code>. * * @param object The object to be inserted. * @throws IllegalArgumentException Throws an IllegalArgumentException * if something goes wrong with the insertion due to the passed argument. */ public void insert(T object) { super.insert(object); MAX_SWEEPAREA_SIZE = Math.max(MAX_SWEEPAREA_SIZE, size()); //determine maximum size of the sweep area } } /** * A class for counting */ public static class Counter { /** * Internal counter */ public long counter = 0; } /** * Counter for coomparison operations */ public static final Counter comparisons = new Counter(); /** * Maximum size of the sweep area (number of elements) */ public static int MAX_SWEEPAREA_SIZE = 0; /** * Constructs an object of the class Orenstain: * * @param input0 * the first input cursor * @param input1 * the second input cursor * @param joinPredicate * the join predicate * @param newSorter * provides a function that returns sorted inputs * @param newResult * is a function for creating the final result object * @param initialCapacity * the initial capacity of the ArrayBag that is used for * organiting the SweepAreas */ public Orenstein(Cursor input0, Cursor input1, Predicate joinPredicate, Function newSorter, Function newResult, final int initialCapacity) { super(input0, input1, newSorter, newSorter, new OrensteinSA(0, initialCapacity, joinPredicate), new OrensteinSA(1, initialCapacity, joinPredicate), new ComparableComparator(), newResult); } /* * top-level constructor for a self-join */ /* * public Orenstein(Cursor input, Predicate joinPredicate, Function * newSorter, Function newResult, final int initialCapacity, final int * type){ super(input, newSorter, joinPredicate, new * OrensteinSweepArea(initialCapacity), newResult, type ); } */ /** * Constructs an object of the class Orenstain that wraps input iterators as * cursors. * * @param input0 * the first input iterator * @param input1 * the second input iterator * @param joinPredicate * the join predicate * @param newSorter * provides a function that returns sorted inputs * @param newResult * is a function for creating the final result object * @param initialCapacity * the initial capacity of the ArrayBag that is used for * organiting the SweepAreas */ public Orenstein(Iterator input0, Iterator input1, Predicate joinPredicate, Function newSorter, Function newResult, final int initialCapacity) { this(new IteratorCursor(input0), new IteratorCursor(input1), joinPredicate, newSorter, newResult, initialCapacity); } /** * Constructs an object of the class Orenstain that wraps input iterators as * cursors. The join predicate test for overlaps. The method does not apply * a function for creating the final object. * * @param input0 * the first input iterator * @param input1 * the second input iterator * @param newSorter * provides a function that returns sorted inputs * @param initialCapacity * the initial capacity of the ArrayBag that is used for * organiting the SweepAreas */ public Orenstein(Iterator input0, Iterator input1, Function newSorter, final int initialCapacity) { this(input0, input1, OverlapsPredicate.DEFAULT_INSTANCE, newSorter, Tuplify.DEFAULT_INSTANCE, initialCapacity); } /** * constructor for a self-join */ // public Orenstein(Iterator input, Predicate joinPredicate, Function newSorter, Function newResult, final int initialCapacity) { // this(new BufferedCursor(input), joinPredicate, newSorter, newResult, initialCapacity, SortMergeJoin.THETA_JOIN); // } }