// Orenstein.java example
//
// Explorer
// xxl-master
/* XXL: The eXtensible and fleXible Library for data processing

Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger
                        Head of the Database Research Group
                        Department of Mathematics and Computer Science
                        University of Marburg
                        Germany

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library;  If not, see <http://www.gnu.org/licenses/>. 

    http://code.google.com/p/xxl/

*/

package xxl.core.spatial.cursors;

import java.util.Iterator;

import xxl.core.collections.bags.ArrayBag;
import xxl.core.collections.bags.LIFOBag;
import xxl.core.collections.sweepAreas.BagSAImplementor;
import xxl.core.collections.sweepAreas.ImplementorBasedSweepArea;
import xxl.core.comparators.ComparableComparator;
import xxl.core.cursors.Cursor;
import xxl.core.cursors.joins.SortMergeJoin;
import xxl.core.cursors.wrappers.IteratorCursor;
import xxl.core.functions.Function;
import xxl.core.functions.Tuplify;
import xxl.core.predicates.Predicate;
import xxl.core.spatial.KPEzCode;
import xxl.core.spatial.predicates.OverlapsPredicate;
import xxl.core.util.BitSet;

/**
 * The spatial join algorithm based on space-filling curves proposed by Jack
 * Orenstein. See [Ore 91] Jack A. Orenstein: An Algorithm for Computing the
 * Overlay of k-Dimensional Spaces. SSD 1991: 381-400 for a detailed
 * explanation, and [DS 01] Jens-Peter Dittrich, Bernhard Seeger: GESS: a
 * Scalable Similarity-Join Algorithm for Mining Large Data Sets in High
 * Dimensional Spaces. ACM SIGKDD-2001 for a review of Orenstein's algorithm.
 * <br>
 * <br>
 * Orenstein's algorithm is based on a binary recursive partitioning, where the
 * binary code represents the so-called Z-ordering (z-codes). <br>
 * <br>
 * Orenstein's algorithm (ORE) assigns each hypercube of the input relations to
 * disjoint subspaces of the recursive partitioning whose union entirely covers
 * the hypercube. ORE sorts the two sets of hypercubes derived from the input
 * relations (including the possible replicates) w.r.t. the lexicographical
 * ordering of their binary codes. After that, the relations are merged using
 * two main-memory stacks Stack_R and Stack_S. It is guaranteed that for two
 * adjacent hypercubes in a stack, the prefix property is satisfied for their
 * associated codes. Only those hypercubes are joined that have the same prefix
 * code. <br>
 * <br>
 * A deficiency of ORE is that the different assignment strategies examined in
 * [Ore91] cause substantial replication rates. This results in an increase of
 * the problem space and hence, sorting will be very expensive. Furthermore, ORE
 * has not addressed the problem of eliminating duplicates in the result set.
 * <br>
 * <br>
 * Note that the method <code>reorganize(final T currentStatus, int ID)</code>
 * of the sweep area could actually be implemented with only one line of code.
 * For efficiency reasons we use a somewhat longer version of the method here.
 * <br>
 * <br>
 * Use-case: <br>
 * The main-method of this class contains the complete code to compute a
 * similarity join of two sets of points using Orensteins algorithm.
 * 
 * @see xxl.core.cursors.joins.SortMergeJoin
 * @see xxl.core.spatial.cursors.Mappers
 * @see xxl.core.spatial.cursors.GESS
 * @see xxl.core.spatial.cursors.Replicator
 *  
 */
public class Orenstein extends SortMergeJoin {
	/**
	 * The sweep area used by the Orenstein algorithm.
	 */
	public static class OrensteinSA<T> extends ImplementorBasedSweepArea<T> {
		/**
		 * internal cursor to the bag (for reasons of efficiency)
		 */
		protected LIFOBag<T> bag;
		/**
		 * Creates a new Orenstein SweepArea
		 * 
		 * @param ID
		 *            ID of the SweepArea
		 * @param lifoBag
		 *            the lifobag for organizing the SweepArea
		 * @param joinPredicate
		 *            the predicate of the join
		 */
		public OrensteinSA(int ID, LIFOBag<T> lifoBag, Predicate<? super T> joinPredicate) {
			super(new BagSAImplementor<T>(lifoBag), ID, false, joinPredicate, 2);
			this.bag = lifoBag;
		}
		/**
		 * Creates a new OrensteinSweepArea. Uses an ArrayBag to store elements.
		 * 
		 * @param ID
		 *            ID of the SweepArea
		 * @param initialCapacity
		 *            the initial capacity of the ArrayBag which is used for
		 *            organizing the SweepArea
		 * @param joinPredicate
		 *            the predicate of the join
		 */
		public OrensteinSA(int ID, int initialCapacity, Predicate<? super T> joinPredicate) {
			this(ID, new ArrayBag<T>(initialCapacity), joinPredicate);
		}
		
		/**
		 * In contrast to the method {@link #expire(Object, int)}, this method removes
		 * all expired elements from a SweepArea without returning them. 
		 * The default implementation removes all elements returned by a call to 
		 * {@link #expire(Object, int)}.<BR>
		 * In order to perform a more efficient removal, this method should
		 * be overwritten, e.g., by implementing a bulk deletion. 
		 * 
		 * @param currentStatus The object containing the necessary information
		 * 		  to perform the reorganization step.
		 * @param ID An ID determining from which input this reorganization step
		 * 		   is triggered.
		 * @throws UnsupportedOperationException An UnsupportedOperationException is thrown, if
		 * 		   is method is not supported by this SweepArea.
		 * @throws IllegalStateException Throws an IllegalStateException if
		 * 		   this method is called at an invalid state.
		 */
		// fourth version of this method: low level implementation of delete
		// (fastest version)
		public void reorganize(final T currentStatus, int ID)
				throws IllegalStateException { //check nesting-condition
			for (Cursor<T> cursor = bag.lifoCursor(); cursor.hasNext();) {
				BitSet top = ((KPEzCode) cursor.next()).getzCode();
				BitSet query = ((KPEzCode) currentStatus).getzCode();
				if ((query.precision() < top.precision())
						|| (query.compare(top) != 0))
					cursor.remove();
				else
					break;
			}
		}
		/**
		 * This method counts the number of comparisons required for processing
		 * the query.
		 * 
		 * @see xxl.core.collections.sweepAreas.SweepArea#query(java.lang.Object,
		 *      int)
		 * @param o
		 *            The query object. This object is typically probed against
		 *            the elements contained in this SweepArea.
		 * @param ID
		 *            An ID determining from which input this method is called.
		 * @return All matching elements of this SweepArea are returned as an
		 *         iterator.
		 */
		public Iterator<T> query(T o, int ID) {
			comparisons.counter += impl.size();
			return super.query(o, ID);
		}
		
		/**
		 * Inserts the given element into this SweepArea. The default implementation
		 * simply forwards this call to the underlying implementor. Thus,
		 * it calls <code>impl.insert(o)</code>.
		 * 
		 * @param object The object to be inserted.
		 * @throws IllegalArgumentException Throws an IllegalArgumentException
		 * 		if something goes wrong with the insertion due to the passed argument.
		 */
		public void insert(T object) {
			super.insert(object);
			MAX_SWEEPAREA_SIZE = Math.max(MAX_SWEEPAREA_SIZE, size()); 
			//determine maximum size of the sweep area
		}
	}
	
	/**
	 * A class for counting
	 */
	public static class Counter {
		/**
		 * Internal counter
		 */
		public long counter = 0;
	}
	/**
	 * Counter for coomparison operations
	 */
	public static final Counter comparisons = new Counter();
	/**
	 * Maximum size of the sweep area (number of elements)
	 */
	public static int MAX_SWEEPAREA_SIZE = 0;
	/**
	 * Constructs an object of the class Orenstain:
	 * 
	 * @param input0
	 *            the first input cursor
	 * @param input1
	 *            the second input cursor
	 * @param joinPredicate
	 *            the join predicate
	 * @param newSorter
	 *            provides a function that returns sorted inputs
	 * @param newResult
	 *            is a function for creating the final result object
	 * @param initialCapacity
	 *            the initial capacity of the ArrayBag that is used for
	 *            organiting the SweepAreas
	 */
	public Orenstein(Cursor input0, Cursor input1, Predicate joinPredicate,
			Function newSorter, Function newResult, final int initialCapacity) {
		super(input0, input1, newSorter, newSorter, new OrensteinSA(0,
				initialCapacity, joinPredicate), new OrensteinSA(1,
				initialCapacity, joinPredicate),
				new ComparableComparator(), newResult);
	}
	/*
	 * top-level constructor for a self-join
	 */
	/*
	 * public Orenstein(Cursor input, Predicate joinPredicate, Function
	 * newSorter, Function newResult, final int initialCapacity, final int
	 * type){ super(input, newSorter, joinPredicate, new
	 * OrensteinSweepArea(initialCapacity), newResult, type ); }
	 */
	/**
	 * Constructs an object of the class Orenstain that wraps input iterators as
	 * cursors.
	 * 
	 * @param input0
	 *            the first input iterator
	 * @param input1
	 *            the second input iterator
	 * @param joinPredicate
	 *            the join predicate
	 * @param newSorter
	 *            provides a function that returns sorted inputs
	 * @param newResult
	 *            is a function for creating the final result object
	 * @param initialCapacity
	 *            the initial capacity of the ArrayBag that is used for
	 *            organiting the SweepAreas
	 */
	public Orenstein(Iterator input0, Iterator input1, Predicate joinPredicate,
			Function newSorter, Function newResult, final int initialCapacity) {
		this(new IteratorCursor(input0), new IteratorCursor(input1),
				joinPredicate, newSorter, newResult, initialCapacity);
	}
	/**
	 * Constructs an object of the class Orenstain that wraps input iterators as
	 * cursors. The join predicate test for overlaps. The method does not apply
	 * a function for creating the final object.
	 * 
	 * @param input0
	 *            the first input iterator
	 * @param input1
	 *            the second input iterator
	 * @param newSorter
	 *            provides a function that returns sorted inputs
	 * @param initialCapacity
	 *            the initial capacity of the ArrayBag that is used for
	 *            organiting the SweepAreas
	 */
	public Orenstein(Iterator input0, Iterator input1, Function newSorter,
			final int initialCapacity) {
		this(input0, input1, OverlapsPredicate.DEFAULT_INSTANCE, newSorter,
				Tuplify.DEFAULT_INSTANCE, initialCapacity);
	}
	/**
	 * constructor for a self-join
	 */
//	public Orenstein(Iterator input, Predicate joinPredicate, Function newSorter, Function newResult, final int initialCapacity) {
//		this(new BufferedCursor(input), joinPredicate, newSorter, newResult, initialCapacity, SortMergeJoin.THETA_JOIN);
//	}
	
}