/* XXL: The eXtensible and fleXible Library for data processing
Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger
Head of the Database Research Group
Department of Mathematics and Computer Science
University of Marburg
Germany
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; If not, see <http://www.gnu.org/licenses/>.
http://code.google.com/p/xxl/
*/
package xxl.core.cursors.distincts;
import java.util.Iterator;
import xxl.core.collections.bags.Bag;
import xxl.core.collections.bags.ListBag;
import xxl.core.collections.queues.ArrayQueue;
import xxl.core.collections.queues.Queue;
import xxl.core.cursors.AbstractCursor;
import xxl.core.cursors.Cursor;
import xxl.core.cursors.Cursors;
import xxl.core.cursors.wrappers.QueueCursor;
import xxl.core.functions.AbstractFunction;
import xxl.core.functions.Function;
import xxl.core.predicates.Equal;
import xxl.core.predicates.Predicate;
import xxl.core.predicates.RightBind;
/**
* A nested-loops implementation of the distinct operator, i.e., all duplicates
* contained in a cursor will be removed. Depending on the specified memory
* size and object size as many elements of the input cursor as possible will
* be inserted into a temporal bag (typically located in main memory). To
* guarantee that no duplicates will be inserted into it, the bag is searched
* for duplicates with the help of a user defined predicate. If not all
* elements can be inserted into the bag they will be temporarily stored in a
* queue, that typically resides in external memory, and will be inserted
* when the bag has been emptied due to calls to the <code>next</code> method
* of this class.
*
* <p><b>Example usage (1):</b>
* <code><pre>
* NestedLoopsDistinct<Integer> distinct = new NestedLoopsDistinct<Integer>(
* new DiscreteRandomNumber(new JavaDiscreteRandomWrapper(21), 30),
* 32,
* 4
* );
*
* distinct.open();
*
* while(distinct.hasNext())
* System.out.println(distinct.next());
*
* distinct.close();
* </pre></code>
* The input cursor shown in this example delivers 30 randomly distributed
* integer numbers contained in the interval [0, 20]. The main memory size is
* set to 32 bytes and the object size to 4 bytes. So the
* {@link ListBag list-bag} generated by the default factory method that is
* used in this case as default is able to hold a maximum of 7 elements in
* main memory (32 / 4 - 1 = 7). The remaining one element, that fits in main
* memory, is reserved for the comparison of two elements. Due to the fact
* that not all elements of the
* input cursor can be stored in the temporal main memory bag, the remaining
* elements will be stored in an {@link ArrayQueue array-queue} returned by the
* factory method that is used as default. Normally this queue should be placed
* in external memory. Running this example shows that no duplicates are
* returned by this operator.</p>
*
* <p><b>Note:</b> If an input iteration is given by an object of the class
* {@link Iterator}, i.e., it does not support the <code>peek</code> operation,
* it is internally wrapped to a cursor.</p>
*
* @param <E> the type of the elements returned by this distinct operator.
* @see java.util.Iterator
* @see xxl.core.cursors.Cursor
* @see xxl.core.collections.bags.Bag
* @see xxl.core.collections.queues.Queue
* @see xxl.core.cursors.distincts.SortBasedDistinct
* @see xxl.core.relational.cursors.NestedLoopsDistinct
*/
public class NestedLoopsDistinct<E> extends AbstractCursor<E> {

    /**
     * The input cursor delivering the elements for the distinct operation.
     */
    protected Cursor<? extends E> cursor;

    /**
     * The cursor over the temporary bag of the current pass. It delivers the
     * distinct elements collected during that pass and is <code>null</code>
     * until the first pass has been performed (and again after a call to
     * <code>reset</code>).
     */
    protected Cursor<E> results = null;

    /**
     * A queue used to store the elements that did not fit into the temporary
     * bag during a pass. It is created lazily by <code>newQueue</code> the
     * first time an element overflows, so it stays <code>null</code> as long
     * as every distinct element fits into the bag.
     */
    protected Queue<E> remainder = null;

    /**
     * A parameterless function returning an empty bag on demand.
     */
    protected Function<?, ? extends Bag<E>> newBag;

    /**
     * A parameterless function returning an empty queue on demand.
     */
    protected Function<?, ? extends Queue<E>> newQueue;

    /**
     * The binary predicate deciding whether two elements are equal. For every
     * candidate element its right argument is bound to that candidate (see
     * {@link RightBind}) and the resulting unary predicate is used to query
     * the temporary bag for duplicates.
     */
    protected Predicate<? super E> predicate;

    /**
     * The maximum number of elements that can be stored in the bag returned by
     * the function <code>newBag</code>.
     */
    protected int maxTuples;

    /**
     * An internally used flag signaling whether the elements inserted into
     * the bag are delivered by the input cursor (<code>false</code>) or by
     * the queue <code>remainder</code> (<code>true</code>).
     */
    protected boolean initialized = false;

    /**
     * Creates a new instance of the nested-loops distinct operator. The input
     * iterator is wrapped to a cursor. Determines the maximum number of
     * elements that can be stored in the bag used for the temporary storage
     * of the elements of the input cursor:
     * <pre>
     *     maxTuples = memSize / objectSize - 1
     * </pre>
     * The arguments are validated before the division is performed, so an
     * object size of zero is reported as an
     * <code>IllegalArgumentException</code> instead of surfacing as an
     * <code>ArithmeticException</code>.
     *
     * @param input the input iterator delivering the elements.
     * @param memSize the maximum amount of available main memory (bytes) for
     *        the bag.
     * @param objectSize the size (bytes) needed to store one element.
     * @param predicate the binary predicate returning <code>true</code> if two
     *        elements are equal.
     * @param newBag a parameterless function returning an empty bag.
     * @param newQueue a parameterless function returning an empty queue.
     * @throws IllegalArgumentException if the object size is not positive or
     *         if the memory cannot hold at least two objects.
     */
    public NestedLoopsDistinct(Iterator<? extends E> input, int memSize, int objectSize, Predicate<? super E> predicate, Function<?, ? extends Bag<E>> newBag, Function<?, ? extends Queue<E>> newQueue) throws IllegalArgumentException {
        if (objectSize <= 0)
            throw new IllegalArgumentException("object size must be positive, but was " + objectSize + ".");
        // For a positive object size, "memSize / objectSize < 2" is equivalent
        // to "memSize < 2*objectSize" but cannot overflow.
        int slots = memSize / objectSize;
        if (slots < 2)
            throw new IllegalArgumentException("insufficient main memory available.");
        this.cursor = Cursors.wrap(input);
        this.newBag = newBag;
        this.newQueue = newQueue;
        this.predicate = predicate;
        // one slot is not handed to the bag
        this.maxTuples = slots - 1;
    }

    /**
     * Creates a new instance of the nested-loops distinct operator. The input
     * iterator is wrapped to a cursor. Determines the maximum number of
     * elements that can be stored in the bag used for the temporary storage
     * of the elements of the input cursor:
     * <pre>
     *     maxTuples = memSize / objectSize - 1
     * </pre>
     * Uses factory functions creating {@link ListBag list-bags} and
     * {@link ArrayQueue array-queues}. Determines the equality between two
     * elements with the help of the default instance of the predicate
     * {@link Equal}.
     *
     * @param input the input iterator delivering the elements.
     * @param memSize the maximum amount of available main memory (bytes) for
     *        the bag.
     * @param objectSize the size (bytes) needed to store one element.
     * @throws IllegalArgumentException if the object size is not positive or
     *         if the memory cannot hold at least two objects.
     */
    public NestedLoopsDistinct(Iterator<? extends E> input, int memSize, int objectSize) throws IllegalArgumentException {
        this(
            input,
            memSize,
            objectSize,
            Equal.DEFAULT_INSTANCE,
            new AbstractFunction<Object, ListBag<E>>() {
                public ListBag<E> invoke() {
                    return new ListBag<E>();
                }
            },
            new AbstractFunction<Object, ArrayQueue<E>>() {
                public ArrayQueue<E> invoke() {
                    return new ArrayQueue<E>();
                }
            }
        );
    }

    /**
     * Opens the nested-loops distinct operator and its input cursor. Multiple
     * calls to <code>open</code> do not have any effect, i.e., if the
     * operator has already been opened this method returns immediately.
     */
    public void open() {
        if (isOpened)
            return;
        super.open();
        cursor.open();
    }

    /**
     * Closes the nested-loops distinct operator. Closes the queue
     * <code>remainder</code> (if one has been created), the input cursor and
     * the cursor over the current temporary bag (if a pass has already been
     * performed). Multiple calls to <code>close</code> do not have any
     * effect, i.e., if the operator has already been closed this method
     * returns immediately.
     */
    public void close() {
        if (isClosed)
            return;
        super.close();
        if (remainder != null)
            remainder.close();
        cursor.close();
        // results is still null if the operator is closed before any element
        // has been requested
        if (results != null)
            results.close();
    }

    /**
     * Returns <code>true</code> if the iteration has more elements.
     *
     * <p>If the cursor over the current temporary bag still has elements,
     * <code>true</code> is returned immediately. Otherwise a new pass is
     * performed: a bag is created by calling <code>newBag.invoke()</code> and
     * filled with elements taken from the input cursor (first pass) or from
     * the queue <code>remainder</code> (subsequent passes). An element is
     * dropped if the bag's <code>query</code> method, called with the
     * predicate bound to that element, returns a non-empty cursor, i.e., if
     * the bag already contains a duplicate. Other elements are inserted into
     * the bag as long as it holds fewer than <code>maxTuples</code> elements;
     * once it is full they are appended to the queue <code>remainder</code>,
     * which is created and opened on first use. Finally the result cursor is
     * set to the bag's cursor and its <code>hasNext</code> value is
     * returned.</p>
     *
     * @return <code>true</code> if the nested-loops distinct operator has more
     *         elements.
     */
    protected boolean hasNextObject() {
        if (results != null && results.hasNext())
            return true;

        // After the first pass the only source of elements is the remainder
        // queue; if none was ever created or it has been drained, there is
        // nothing left to deliver.
        if (initialized && (remainder == null || remainder.size() == 0))
            return false;

        Cursor<? extends E> input;
        int pending = 0;
        if (initialized) {
            input = new QueueCursor<E>(remainder);
            // Overflowing elements are re-enqueued into the very queue that is
            // being consumed, so this pass is bounded by the number of
            // elements present when it starts.
            pending = remainder.size();
        }
        else
            input = cursor;

        Bag<E> tmpBag = newBag.invoke();
        while (initialized ? pending-- > 0 : input.hasNext()) {
            E next = input.next();
            if (tmpBag.query(new RightBind<E>(predicate, next)).hasNext())
                continue; // duplicate of an element already in the bag
            if (tmpBag.size() < maxTuples)
                tmpBag.insert(next);
            else {
                if (remainder == null)
                    (remainder = newQueue.invoke()).open();
                remainder.enqueue(next);
            }
        }
        initialized = true;
        results = tmpBag.cursor();
        return results.hasNext();
    }

    /**
     * Returns the next element in the iteration. The element is taken from
     * the cursor over the current temporary bag and removed through that
     * cursor's <code>remove</code> method right after it has been fetched.
     *
     * @return the next element in the iteration.
     */
    protected E nextObject() {
        E result = results.next();
        results.remove();
        return result;
    }

    /**
     * Resets the nested-loops distinct operator to its initial state (optional
     * operation). This method clears the queue <code>remainder</code> (if one
     * has been created), resets the input cursor, closes the result cursor
     * (if a pass has already been performed), sets it to <code>null</code>
     * and marks the operator as not initialized, so that the next pass reads
     * from the input cursor again.
     *
     * @throws UnsupportedOperationException if the <code>reset</code>
     *         operation is not supported by the nested-loops distinct
     *         operator.
     */
    public void reset() throws UnsupportedOperationException {
        super.reset();
        if (remainder != null)
            remainder.clear();
        cursor.reset();
        // results is still null if no element has been requested yet
        if (results != null) {
            results.close();
            results = null;
        }
        initialized = false;
    }

    /**
     * Returns <code>true</code> if the <code>reset</code> operation is
     * supported by the nested-loops distinct operator, which is the case
     * exactly if the input cursor supports it.
     *
     * @return <code>true</code> if the <code>reset</code> operation is
     *         supported by the input cursor, otherwise <code>false</code>.
     */
    public boolean supportsReset() {
        return cursor.supportsReset();
    }
}