/* XXL: The eXtensible and fleXible Library for data processing

Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger
                        Head of the Database Research Group
                        Department of Mathematics and Computer Science
                        University of Marburg
                        Germany

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; If not, see <http://www.gnu.org/licenses/>.

    http://code.google.com/p/xxl/

*/

package xxl.core.cursors.groupers;

import java.util.Iterator;
import java.util.Map;

import xxl.core.collections.bags.Bag;
import xxl.core.collections.bags.ListBag;
import xxl.core.collections.queues.ArrayQueue;
import xxl.core.collections.queues.Queue;
import xxl.core.cursors.AbstractCursor;
import xxl.core.cursors.Cursor;
import xxl.core.cursors.Cursors;
import xxl.core.cursors.wrappers.QueueCursor;
import xxl.core.functions.AbstractFunction;
import xxl.core.functions.Function;

/**
 * A nested-loops implementation of the group operator, i.e., all elements of
 * the input iteration get partitioned according to a user defined function.
 * Depending on the specified memory size and object size as many elements as
 * possible will be inserted into temporary bags. Each bag refers to a special
 * key stored in a {@link Map map} in main memory. If enough main memory is
 * available the bags used to store the elements may also reside in main
 * memory, otherwise they should be located on external memory. A key, a bag
 * refers to, is returned when the user defined unary mapping function is
 * applied to an element of the input iteration. If the map's size gets larger
 * than
 * <pre>
 *     maxTuples = ((memSize - objectSize) / keySize) - 1
 * </pre>
 * the remaining elements of the input iteration are temporarily stored in a
 * queue that typically resides in external memory.
 *
 * <p><b>Note:</b> If the input iteration is given by an object of the class
 * {@link Iterator}, i.e., it does not support the <code>peek</code> operation,
 * it is internally wrapped to a cursor.</p>
 *
 * <p><b>Example usage (1):</b>
 * <code><pre>
 *     NestedLoopsGrouper&lt;Integer&gt; grouper = new NestedLoopsGrouper&lt;Integer&gt;(
 *         new Enumerator(21),
 *         new Function&lt;Integer, Integer&gt;() {
 *             public Integer invoke(Integer next) {
 *                 return next % 5;
 *             }
 *         },
 *         new TreeMap(),
 *         32,
 *         4,
 *         8
 *     );
 *
 *     grouper.open();
 *
 *     while (grouper.hasNext()) {
 *         Cursor&lt;Integer&gt; nextGroup = grouper.next();
 *         System.out.print("Next group: ");
 *         while (nextGroup.hasNext())
 *             System.out.print(nextGroup.next() + " ");
 *         System.out.println();
 *     }
 *
 *     grouper.close();
 * </pre></code>
 * The enumerator shown in this example delivers the integer numbers from 0 to
 * 20. The main memory size is set to 32 bytes and the object size to 4 bytes.
 * The size of a key for an element is unrealistically set to 8 bytes. So the
 * {@link java.util.TreeMap tree-map} used to store the keys in main memory can
 * hold two elements. Therefore only two {@link ListBag list-bags} returned by
 * the default factory method can be allocated by a call to
 * <code>hasNext</code>.<br />
 * Due to the fact that not all keys can be stored in the main-memory map the
 * remaining elements that cannot be stored in the two allocated bags will be
 * stored in an {@link ArrayQueue array-queue} returned by the default factory
 * method. Normally this queue should be placed in external memory. The mapping
 * function maps an element of the input iteration to a certain key. In this
 * case it is realized as: (integer value) modulo 5. So the output is:
 * <pre>
 *     Next group: 0 5 10 15 20
 *     Next group: 1 6 11 16
 *     Next group: 2 7 12 17
 *     Next group: 3 8 13 18
 *     Next group: 4 9 14 19
 * </pre></p>
 *
 * @param <E> the type of the elements returned by the input iteration.
 * @see java.util.Iterator
 * @see xxl.core.cursors.Cursor
 * @see xxl.core.cursors.groupers.HashGrouper
 * @see xxl.core.cursors.groupers.SortBasedGrouper
 */
public class NestedLoopsGrouper<E> extends AbstractCursor<Cursor<E>> {

	/**
	 * The input iteration delivering the elements to group.
	 */
	protected Cursor<? extends E> input;

	/**
	 * A cursor iterating over the bags currently held in the map. It is
	 * <code>null</code> until <code>hasNextObject</code> has built the first
	 * set of bags.
	 */
	protected Cursor<? extends Bag<E>> bagIterator = null;

	/**
	 * A queue storing all elements of the input iteration that could not be
	 * stored due to a lack of main-memory size. It is allocated lazily, i.e.,
	 * it stays <code>null</code> as long as no element overflowed.
	 */
	protected Queue<E> remainder = null;

	/**
	 * A map used to store a key for each bag. Usually located in main memory.
	 */
	protected Map<Object, Bag<E>> map;

	/**
	 * An unary function returning a key for each given value.
	 */
	protected Function<? super E, ? extends Object> mapping;

	/**
	 * A parameterless function returning an empty bag.
	 */
	protected Function<?, ? extends Bag<E>> newBag;

	/**
	 * A parameterless function returning an empty queue.
	 */
	protected Function<?, ? extends Queue<E>> newQueue;

	/**
	 * The maximum number of elements (keys) that can be stored in main memory.
	 */
	protected int maxTuples;

	/**
	 * A flag determining if the input is still the given input iteration
	 * (<code>false</code>) or the queue <code>remainder</code>
	 * (<code>true</code>).
	 */
	protected boolean initialized = false;

	/**
	 * Creates a new nested-loops grouper. Determines the maximum number of
	 * keys that can be stored in the main memory map:
	 * <pre>
	 *     ((memSize - objectSize) / keySize) - 1
	 * </pre>
	 * This formula is based on the assumption that only the keys, i.e., the
	 * map, is stored in main memory whereas the bags storing the input
	 * iteration's elements are located in external memory.
	 *
	 * @param input the input iteration delivering the elements to be grouped.
	 * @param mapping an unary mapping function returning a key to a given
	 *        value.
	 * @param map the map which is used for storing the keys in main memory.
	 * @param memSize the maximum amount of available main memory (bytes) for
	 *        the map.
	 * @param objectSize the size (bytes) needed to store one element.
	 * @param keySize the size (bytes) a key needs in main memory.
	 * @param newBag a parameterless function returning an empty bag.
	 * @param newQueue a parameterless function returning an empty queue.
	 * @throws IllegalArgumentException if <code>keySize</code> is not positive
	 *         or if not enough main memory is available.
	 */
	public NestedLoopsGrouper(Iterator<? extends E> input, Function<? super E, ? extends Object> mapping, Map<Object, Bag<E>> map, int memSize, int objectSize, int keySize, Function<?, ? extends Bag<E>> newBag, Function<?, ? extends Queue<E>> newQueue) throws IllegalArgumentException {
		// Validate before dividing: a key size of zero would otherwise surface
		// as an ArithmeticException instead of the documented exception.
		if (keySize <= 0)
			throw new IllegalArgumentException("key size must be positive, but was " + keySize + ".");
		// long arithmetic keeps the comparison exact for very large sizes.
		if (memSize < 2L * keySize + objectSize)
			throw new IllegalArgumentException("insufficient main memory available.");

		this.input = Cursors.wrap(input);
		this.mapping = mapping;
		this.map = map;
		this.newBag = newBag;
		this.newQueue = newQueue;
		this.maxTuples = ((memSize - objectSize) / keySize) - 1;
	}

	/**
	 * Creates a new nested-loops grouper. Determines the maximum number of
	 * keys that can be stored in the main memory map:
	 * <pre>
	 *     ((memSize - objectSize) / keySize) - 1
	 * </pre>
	 * This formula is based on the assumption that only the keys, i.e., the
	 * map, is stored in main memory whereas the bags storing the input
	 * iteration's elements are located in external memory. Uses default
	 * factory methods for list-bags and array-queues.
	 *
	 * @param input the input iterator delivering the elements to be grouped.
	 * @param mapping an unary mapping function returning a key to a given
	 *        value.
	 * @param map the map which is used for storing the keys in main memory.
	 * @param memSize the maximum amount of available main memory (bytes) for
	 *        the map.
	 * @param objectSize the size (bytes) needed to store one element.
	 * @param keySize the size (bytes) a key needs in main memory.
	 * @throws IllegalArgumentException if <code>keySize</code> is not positive
	 *         or if not enough main memory is available.
	 */
	public NestedLoopsGrouper(Iterator<? extends E> input, Function<? super E, ? extends Object> mapping, Map<Object, Bag<E>> map, int memSize, int objectSize, int keySize) throws IllegalArgumentException {
		this(
			input,
			mapping,
			map,
			memSize,
			objectSize,
			keySize,
			new AbstractFunction<Object, ListBag<E>>() {
				public ListBag<E> invoke() {
					return new ListBag<E>();
				}
			},
			new AbstractFunction<Object, ArrayQueue<E>>() {
				public ArrayQueue<E> invoke() {
					return new ArrayQueue<E>();
				}
			}
		);
	}

	/**
	 * Opens the nested-loops grouper, i.e., signals the cursor to reserve
	 * resources, open the input iteration, etc. Before a cursor has been
	 * opened calls to methods like <code>next</code> or <code>peek</code> are
	 * not guaranteed to yield proper results. Therefore <code>open</code> must
	 * be called before a cursor's data can be processed. Multiple calls to
	 * <code>open</code> do not have any effect, i.e., if <code>open</code> was
	 * called the cursor remains in the state <i>opened</i> until its
	 * <code>close</code> method is called.
	 *
	 * <p>Note, that a call to the <code>open</code> method of a closed cursor
	 * usually does not open it again because of the fact that its state
	 * generally cannot be restored when resources are released respectively
	 * files are closed.</p>
	 */
	public void open() {
		if (isOpened)
			return;
		super.open();
		input.open();
	}

	/**
	 * Closes the nested-loops grouper. Signals it to clean up resources,
	 * close queues and cursors, etc. After a call to <code>close</code> calls
	 * to methods like <code>next</code> or <code>peek</code> are not
	 * guaranteed to yield proper results. Multiple calls to <code>close</code>
	 * do not have any effect, i.e., if <code>close</code> was called the
	 * nested-loops grouper remains in the state "closed".
	 */
	public void close() {
		if (isClosed)
			return;
		super.close();
		input.close();
		// The bag iterator and the overflow queue are both created lazily, so
		// either may still be null if the grouper was never iterated.
		if (bagIterator != null)
			bagIterator.close();
		if (remainder != null)
			remainder.close();
	}

	/**
	 * Returns <code>true</code> if the iteration has more elements. (In other
	 * words, returns <code>true</code> if <code>next</code> or
	 * <code>peek</code> would return an element rather than throwing an
	 * exception.) Inserts as many elements as possible in bags determining a
	 * bag for each element by invoking the mapping function. The returned key
	 * is stored in the main memory map and the element is inserted in the
	 * appropriate bag. If no bag exists for a key, a new bag is allocated. If
	 * the map's size is greater or equal to <code>maxTuples</code> the
	 * remaining elements of the input cursor are inserted into a temporary
	 * queue, which delivers the input elements when the
	 * <code>bagIterator</code> has completely been traversed. The
	 * <code>bagIterator</code> contains the elements that will be returned by
	 * calls to <code>next</code> or <code>peek</code>.
	 *
	 * @return <code>true</code> if the nested-loops grouper has more elements.
	 */
	protected boolean hasNextObject() {
		if (bagIterator != null && bagIterator.hasNext())
			return true;

		if (!initialized) {
			// First pass: partition the original input.
			while (input.hasNext())
				distribute(input.next());
			initialized = true;
		}
		else if (remainder != null) {
			// Later passes: re-partition exactly the elements that were queued
			// when this pass started. Elements that still do not fit are
			// appended to the same queue again, hence the fixed count.
			Cursor<? extends E> overflow = new QueueCursor<E>(remainder);
			for (int pending = remainder.size(); pending > 0; pending--)
				distribute(overflow.next());
		}

		bagIterator = Cursors.wrap(map.values().iterator());
		return bagIterator.hasNext();
	}

	/**
	 * Puts the given element into the bag belonging to its key. If the key is
	 * unknown and the map already holds <code>maxTuples</code> keys, the
	 * element is appended to the (lazily created) overflow queue instead.
	 *
	 * @param element the element to be assigned to a bag or to the overflow
	 *        queue.
	 */
	private void distribute(E element) {
		Object key = mapping.invoke(element);
		if (!map.containsKey(key)) {
			if (map.size() >= maxTuples) {
				if (remainder == null) {
					remainder = newQueue.invoke();
					remainder.open();
				}
				remainder.enqueue(element);
				return;
			}
			map.put(key, newBag.invoke());
		}
		map.get(key).insert(element);
	}

	/**
	 * Returns the next element in the iteration. This element will be
	 * accessible by some of the cursor's methods, e.g., <code>update</code> or
	 * <code>remove</code>, until a call to <code>next</code> or
	 * <code>peek</code> occurs. That is, calling <code>next</code> or
	 * <code>peek</code> advances the iteration and therefore its previous
	 * element will not be accessible any more. Returns the next group
	 * delivered by the <code>bagIterator</code>, a cursor, and removes the
	 * corresponding bag from the map.
	 *
	 * @return the next element in the iteration.
	 */
	protected Cursor<E> nextObject() {
		Cursor<E> group = bagIterator.next().cursor();
		// Free the key's slot so that a key from the overflow queue can take it.
		bagIterator.remove();
		return group;
	}

	/**
	 * Resets the cursor to its initial state such that the caller is able to
	 * traverse the underlying data structure again without constructing a new
	 * cursor (optional operation). The modifications, removes and updates
	 * concerning the underlying data structure, are still persistent.
	 *
	 * <p>Bags that have not been returned yet are dropped from the map and the
	 * overflow queue is cleared, so that the elements of the reset input are
	 * not grouped a second time on top of left-over state.</p>
	 *
	 * <p>Note, that this operation is optional and might not work for all
	 * cursors.</p>
	 *
	 * @throws UnsupportedOperationException if the <code>reset</code>
	 *         operation is not supported by the nested-loops grouper.
	 */
	public void reset() throws UnsupportedOperationException {
		super.reset();
		input.reset();
		// bagIterator is null when reset is called before the first hasNext.
		if (bagIterator != null) {
			bagIterator.close();
			bagIterator = null;
		}
		// Discard bags of groups that were not delivered yet; otherwise their
		// elements would be inserted again when the input is re-read.
		map.clear();
		if (remainder != null)
			remainder.clear();
		initialized = false;
	}

	/**
	 * Returns <code>true</code> if the <code>reset</code> operation is
	 * supported by the nested-loops grouper. Otherwise it returns
	 * <code>false</code>.
	 *
	 * @return <code>true</code> if the <code>reset</code> operation is
	 *         supported by the nested-loops grouper, otherwise
	 *         <code>false</code>.
	 */
	public boolean supportsReset() {
		return input.supportsReset();
	}
}