/* XXL: The eXtensible and fleXible Library for data processing

Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger
                        Head of the Database Research Group
                        Department of Mathematics and Computer Science
                        University of Marburg
                        Germany

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; If not, see <http://www.gnu.org/licenses/>.

    http://code.google.com/p/xxl/

*/

package xxl.core.cursors.distincts;

import java.util.Iterator;

import xxl.core.collections.bags.Bag;
import xxl.core.collections.bags.ListBag;
import xxl.core.collections.queues.ArrayQueue;
import xxl.core.collections.queues.Queue;
import xxl.core.cursors.AbstractCursor;
import xxl.core.cursors.Cursor;
import xxl.core.cursors.Cursors;
import xxl.core.cursors.wrappers.QueueCursor;
import xxl.core.functions.AbstractFunction;
import xxl.core.functions.Function;
import xxl.core.predicates.Equal;
import xxl.core.predicates.Predicate;
import xxl.core.predicates.RightBind;

/**
 * A nested-loops implementation of the distinct operator, i.e., all duplicates
 * contained in a cursor will be removed. Depending on the specified memory
 * size and object size as many elements of the input cursor as possible will
 * be inserted into a temporary bag (typically located in main memory). To
 * guarantee that no duplicates will be inserted into it, the bag is searched
 * for duplicates with the help of a user defined predicate. If not all
 * elements can be inserted into the bag they will be temporarily stored in a
 * queue, that typically resides in external memory, and will be inserted
 * when the bag has been emptied due to calls to the <code>next</code> method
 * of this class.
 *
 * <p><b>Example usage (1):</b>
 * <pre>
 *     NestedLoopsDistinct&lt;Integer&gt; distinct = new NestedLoopsDistinct&lt;Integer&gt;(
 *         new DiscreteRandomNumber(new JavaDiscreteRandomWrapper(21), 30),
 *         32,
 *         4
 *     );
 *
 *     distinct.open();
 *
 *     while(distinct.hasNext())
 *         System.out.println(distinct.next());
 *
 *     distinct.close();
 * </pre>
 * The input cursor shown in this example delivers 30 randomly distributed
 * integer numbers contained in the interval [0, 20]. The main memory size is
 * set to 32 bytes and the object size to 4 bytes. So the
 * {@link ListBag list-bag} generated by the default factory method that is
 * used in this case is able to hold a maximum of 7 elements
 * (<code>32 / 4 - 1</code>) in main memory. The remaining one element, that
 * fits in main memory, is reserved for the comparison of two elements. If not
 * all distinct elements of the input cursor can be stored in the temporary
 * main memory bag, the remaining elements will be stored in an
 * {@link ArrayQueue array-queue} returned by the factory method that is used
 * as default. Normally this queue should be placed in external memory.
 * Running this example shows that no duplicates are returned by this
 * operator.</p>
 *
 * <p><b>Note:</b> If an input iteration is given by an object of the class
 * {@link Iterator}, i.e., it does not support the <code>peek</code> operation,
 * it is internally wrapped to a cursor.</p>
 *
 * @param <E> the type of the elements returned by this distinct operator.
 * @see java.util.Iterator
 * @see xxl.core.cursors.Cursor
 * @see xxl.core.collections.bags.Bag
 * @see xxl.core.collections.queues.Queue
 * @see xxl.core.cursors.distincts.SortBasedDistinct
 * @see xxl.core.relational.cursors.NestedLoopsDistinct
 */
public class NestedLoopsDistinct<E> extends AbstractCursor<E> {

	/**
	 * The input cursor delivering the elements for the distinct operation.
	 */
	protected Cursor<? extends E> cursor;

	/**
	 * The result cursor delivering only distinct elements of the input cursor.
	 * It is <code>null</code> until the first pass has been computed and
	 * after a call to <code>reset</code>.
	 */
	protected Cursor<E> results = null;

	/**
	 * A queue used to store the remaining elements during the operation if not
	 * all elements can be stored in the temporary main memory bag. It is
	 * created lazily, i.e., it stays <code>null</code> as long as no element
	 * had to be deferred.
	 */
	protected Queue<E> remainder = null;

	/**
	 * A parameterless function returning an empty bag on demand.
	 */
	protected Function<?, ? extends Bag<E>> newBag;

	/**
	 * A parameterless function returning an empty queue on demand.
	 */
	protected Function<?, ? extends Queue<E>> newQueue;

	/**
	 * A binary predicate determining if two elements are equal. It is bound
	 * to the element under inspection (as right argument) and then used to
	 * query the temporary bag for an already stored duplicate.
	 */
	protected Predicate<? super E> predicate;

	/**
	 * The maximum number of elements that can be stored in the bag returned by
	 * the function <code>newBag</code>.
	 */
	protected int maxTuples;

	/**
	 * An internally used flag signaling if the elements inserted into the bag
	 * are delivered from the input cursor (<code>false</code>) or the queue
	 * <code>remainder</code> (<code>true</code>).
	 */
	protected boolean initialized = false;

	/**
	 * Creates a new instance of the nested-loops distinct operator. The input
	 * iterator is wrapped to a cursor. Determines the maximum number of
	 * elements that can be stored in the bag used for the temporary storage of
	 * the elements of the input cursor:
	 * <pre>
	 *     maxTuples = memSize / objectSize - 1
	 * </pre>
	 * The arguments are validated before any state is set up, so an invalid
	 * object size is reported as an <code>IllegalArgumentException</code>
	 * instead of surfacing as a division by zero.
	 *
	 * @param input the input iterator delivering the elements.
	 * @param memSize the maximum amount of available main memory (bytes) for
	 *        the bag.
	 * @param objectSize the size (bytes) needed to store one element; must be
	 *        positive.
	 * @param predicate the binary predicate returning <code>true</code> if two
	 *        elements are equal.
	 * @param newBag a parameterless function returning an empty bag.
	 * @param newQueue a parameterless function returning an empty queue.
	 * @throws IllegalArgumentException if <code>objectSize</code> is not
	 *         positive or if not enough main memory is available, i.e., less
	 *         than two elements fit into <code>memSize</code>.
	 */
	public NestedLoopsDistinct(Iterator<? extends E> input, int memSize, int objectSize, Predicate<? super E> predicate, Function<?, ? extends Bag<E>> newBag, Function<?, ? extends Queue<E>> newQueue) throws IllegalArgumentException {
		if (objectSize <= 0)
			throw new IllegalArgumentException("object size must be positive.");
		// equivalent to memSize < 2*objectSize for positive object sizes, but
		// cannot overflow for very large object sizes
		if (memSize / objectSize < 2)
			throw new IllegalArgumentException("insufficient main memory available.");
		this.cursor = Cursors.wrap(input);
		this.newBag = newBag;
		this.newQueue = newQueue;
		this.predicate = predicate;
		// one slot is reserved for the element that is currently compared
		this.maxTuples = memSize / objectSize - 1;
	}

	/**
	 * Creates a new instance of the nested-loops distinct operator. The input
	 * iterator is wrapped to a cursor. Determines the maximum number of
	 * elements that can be stored in the bag used for the temporary storage of
	 * the elements of the input cursor:
	 * <pre>
	 *     maxTuples = memSize / objectSize - 1
	 * </pre>
	 * Uses default factory methods for list-bags and array-queues. Determines
	 * the equality between two elements with the help of the default instance
	 * of the predicate {@link Equal}.
	 *
	 * @param input the input iterator delivering the elements.
	 * @param memSize the maximum amount of available main memory (bytes) for
	 *        the bag.
	 * @param objectSize the size (bytes) needed to store one element; must be
	 *        positive.
	 * @throws IllegalArgumentException if <code>objectSize</code> is not
	 *         positive or if not enough main memory is available.
	 */
	public NestedLoopsDistinct(Iterator<? extends E> input, int memSize, int objectSize) throws IllegalArgumentException {
		this(
			input,
			memSize,
			objectSize,
			Equal.DEFAULT_INSTANCE,
			new AbstractFunction<Object, ListBag<E>>() {
				@Override
				public ListBag<E> invoke() {
					return new ListBag<E>();
				}
			},
			new AbstractFunction<Object, ArrayQueue<E>>() {
				@Override
				public ArrayQueue<E> invoke() {
					return new ArrayQueue<E>();
				}
			}
		);
	}

	/**
	 * Opens the nested-loops distinct operator, i.e., signals the cursor to
	 * reserve resources, open the input iteration, etc. Before a cursor has
	 * been opened calls to methods like <code>next</code> or <code>peek</code>
	 * are not guaranteed to yield proper results. Therefore <code>open</code>
	 * must be called before a cursor's data can be processed. Multiple calls
	 * to <code>open</code> do not have any effect, i.e., if <code>open</code>
	 * was called the cursor remains in the state <i>opened</i> until its
	 * <code>close</code> method is called.
	 *
	 * <p>Note, that a call to the <code>open</code> method of a closed cursor
	 * usually does not open it again because of the fact that its state
	 * generally cannot be restored when resources are released respectively
	 * files are closed.</p>
	 */
	@Override
	public void open() {
		if (isOpened)
			return;
		super.open();
		cursor.open();
	}

	/**
	 * Closes the nested-loops distinct operator. Signals the operator to clean
	 * up resources: the queue <code>remainder</code> (if one has been
	 * created), the input cursor and the current result cursor (if one has
	 * been computed) are closed. After a call to <code>close()</code> calls to
	 * methods like <code>next</code> or <code>peek</code> are not guaranteed
	 * to yield proper results. Multiple calls to <code>close</code> do not
	 * have any effect, i.e., if <code>close</code> was called the nested-loops
	 * distinct operator remains in the state "closed".
	 */
	@Override
	public void close() {
		if (isClosed)
			return;
		super.close();
		if (remainder != null)
			remainder.close();
		cursor.close();
		// results is still null if the operator is closed before any element
		// has been requested
		if (results != null)
			results.close();
	}

	/**
	 * Returns <code>true</code> if the iteration has more elements. (In other
	 * words, returns <code>true</code> if <code>next</code> or
	 * <code>peek</code> would return an element rather than throwing an
	 * exception.)
	 *
	 * <p>If the current result cursor still delivers elements,
	 * <code>true</code> is returned immediately. Otherwise a new pass is
	 * started: a temporary bag is built calling <code>newBag.invoke()</code>
	 * and as many distinct elements as possible are stored in it. In the
	 * first pass the elements are taken from the input cursor, in every later
	 * pass exactly those elements that are contained in the queue
	 * <code>remainder</code> at the beginning of the pass are consumed. With
	 * the intention to guarantee that no duplicate elements are inserted in
	 * the temporary bag the bag's <code>query</code> method verifying each
	 * element concerning the specified predicate is called. An element is
	 * only kept if the result of the <code>query</code> method (a cursor) is
	 * empty; it is inserted into the bag as long as the bag holds less than
	 * <code>maxTuples</code> elements and appended to the queue
	 * <code>remainder</code> otherwise. At last the exhausted result cursor
	 * of the previous pass is closed, the bag's <code>cursor</code> method is
	 * called and the result cursor's reference is set to this cursor. If the
	 * result cursor contains any elements, <code>true</code> is returned,
	 * otherwise <code>false</code>. If the queue <code>remainder</code>
	 * contains further elements the whole procedure is repeated by a later
	 * call to this method.</p>
	 *
	 * @return <code>true</code> if the nested-loops distinct operator has more
	 *         elements.
	 */
	@Override
	protected boolean hasNextObject() {
		if (results != null && results.hasNext())
			return true;

		Bag<E> tmpBag = newBag.invoke();
		if (!initialized) {
			// first pass: consume the whole input cursor
			while (cursor.hasNext())
				insertOrDefer(tmpBag, cursor.next());
			initialized = true;
		}
		else if (remainder != null) {
			// later pass: consume only the elements that are queued right
			// now, because elements deferred again during this pass are
			// appended to the very same queue.
			// NOTE(review): the wrapping cursor is not closed here, since
			// closing it presumably closes the queue as well - confirm.
			Cursor<E> deferred = new QueueCursor<E>(remainder);
			for (int pending = remainder.size(); pending > 0; pending--)
				insertOrDefer(tmpBag, deferred.next());
		}

		// release the exhausted result cursor of the previous pass
		if (results != null)
			results.close();
		results = tmpBag.cursor();
		return results.hasNext();
	}

	/**
	 * Drops the given element if the bag already contains an equal one;
	 * otherwise stores it in the bag, or in the queue <code>remainder</code>
	 * (created on demand) if the bag already holds <code>maxTuples</code>
	 * elements.
	 *
	 * @param bag the temporary bag of the current pass.
	 * @param element the element to be checked and stored.
	 */
	private void insertOrDefer(Bag<E> bag, E element) {
		Cursor<?> duplicates = bag.query(new RightBind<E>(predicate, element));
		boolean isDuplicate = duplicates.hasNext();
		duplicates.close();
		if (isDuplicate)
			return;
		if (bag.size() < maxTuples) {
			bag.insert(element);
		}
		else {
			if (remainder == null)
				(remainder = newQueue.invoke()).open();
			remainder.enqueue(element);
		}
	}

	/**
	 * Returns the next element in the iteration. The element is taken from
	 * the current result cursor and removed from it (and thereby from the
	 * underlying temporary bag) right away.
	 *
	 * @return the next element in the iteration.
	 */
	@Override
	protected E nextObject() {
		E result = results.next();
		results.remove();
		return result;
	}

	/**
	 * Resets the nested-loops distinct operator to its initial state (optional
	 * operation). So the caller is able to traverse the underlying data
	 * structure again. The modifications, removes and updates concerning the
	 * underlying data structure, are still persistent. This method clears the
	 * queue <code>remainder</code> (if one has been created), resets the
	 * input iteration, closes the result cursor (if one has been computed)
	 * and sets it to <code>null</code>.
	 *
	 * @throws UnsupportedOperationException if the <code>reset</code>
	 *         operation is not supported by the nested-loops distinct
	 *         operator.
	 */
	@Override
	public void reset() throws UnsupportedOperationException {
		super.reset();
		if (remainder != null)
			remainder.clear();
		cursor.reset();
		// results is still null if reset is called before any element has
		// been requested
		if (results != null) {
			results.close();
			results = null;
		}
		initialized = false;
	}

	/**
	 * Returns <code>true</code> if the <code>reset</code> operation is
	 * supported by the nested-loops distinct operator. Otherwise it returns
	 * <code>false</code>. The operator supports <code>reset</code> exactly if
	 * its input cursor does.
	 *
	 * @return <code>true</code> if the <code>reset</code> operation is
	 *         supported by the cursor, otherwise <code>false</code>.
	 */
	@Override
	public boolean supportsReset() {
		return cursor.supportsReset();
	}
}