/* XXL: The eXtensible and fleXible Library for data processing

Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger
                        Head of the Database Research Group
                        Department of Mathematics and Computer Science
                        University of Marburg
                        Germany

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; If not, see <http://www.gnu.org/licenses/>.

    http://code.google.com/p/xxl/

*/

package xxl.core.cursors.differences;

import java.util.Iterator;

import xxl.core.collections.bags.Bag;
import xxl.core.collections.bags.ListBag;
import xxl.core.cursors.AbstractCursor;
import xxl.core.cursors.Cursor;
import xxl.core.cursors.Cursors;
import xxl.core.functions.AbstractFunction;
import xxl.core.functions.Function;
import xxl.core.predicates.Equal;
import xxl.core.predicates.Predicate;
import xxl.core.predicates.RightBind;
import xxl.core.util.BitSet;

/**
 * A nested-loops implementation of the difference operator
 * (<code>input1 - input2</code>). This operation can be performed in two
 * different ways, namely the first realization removes an element of
 * <code>input1</code> if the same element exists in <code>input2</code>. The
 * second way of processing removes all elements of <code>input1</code> that
 * match with an element of <code>input2</code>. This second approach implies
 * that no duplicates will be returned by the difference operator, whereas the
 * first solution may contain duplicates if the number of equal elements in
 * cursor <code>input1</code> is greater than that of <code>input2</code>.
 *
 * <p>The difference operator implemented by this class supports realization
 * depending on a boolean flag <code>all</code> that signals if all elements of
 * cursor <code>input1</code> that fulfill the given predicate will be removed
 * or only one element will be removed. As mentioned above a predicate is used
 * to determine if an element of <code>input2</code> matches with an element of
 * <code>input1</code>. If no predicate has been specified, internally the
 * {@link Equal equality} predicate is used by default. The function
 * <code>newBag</code> generates an empty bag each time it is invoked. This bag
 * should reside in main memory and contains as many elements of
 * <code>input1</code> as possible. Each element of <code>input2</code> gets
 * checked for a match with an element contained in this bag. The size of the
 * bag depends on the specified arguments <code>memSize</code> and
 * <code>objectSize</code>. The maximum number of elements this bag is able to
 * contain is computed by the formula:<br />
 * <pre>
 *     maxTuples = memSize / objectSize - 1
 * </pre>
 * One element is subtracted due to the reason that a minimum of one element of
 * cursor <code>input2</code> has also to be located in main memory.</p>
 *
 * <p><b>Example usage (1):</b>
 * <code><pre>
 *     NestedLoopsDifference<Integer> difference = new NestedLoopsDifference<Integer>(
 *         new Enumerator(21),
 *         new Filter<Integer>(
 *             new Enumerator(21),
 *             new Predicate<Integer>() {
 *                 public boolean invoke(Integer next) {
 *                     return next % 2 == 0;
 *                 }
 *             }
 *         ),
 *         32,
 *         8,
 *         Bag.FACTORY_METHOD,
 *         new Predicate<Integer>() {
 *             public boolean invoke(Integer previous, Integer next) {
 *                 return previous.equals(next);
 *             }
 *         },
 *         false
 *     );
 *
 *     difference.open();
 *
 *     while (difference.hasNext())
 *         System.out.println(difference.next());
 *
 *     difference.close();
 * </pre></code>
 * This nested-loops difference subtracts all even numbers contained in the
 * interval [0, 21) from all numbers of the same interval (input1). The
 * available memory size is set to 32 bytes and an object has the size of 8
 * bytes. So a maximum of 3 elements can be stored in the temporary main memory
 * bag. The FACTORY_METHOD of the class {@link Bag} delivers a new empty bag,
 * therefore a {@link xxl.core.collections.bags.ListBag list-bag} will be used
 * to store the elements of cursor <code>input1</code>. The specified predicate
 * returns <code>true</code> if an element of <code>input1</code> and an
 * element of <code>input2</code> are equal in their integer values. In this
 * example the value of the flag <code>all</code> does not influence the result
 * because <code>input1</code> contains no duplicates. (If <code>input1</code>
 * contained every element twice, the flag would matter: with
 * <code>all == false</code> each element of <code>input2</code> removes at
 * most one matching element of <code>input1</code>.)</p>
 *
 * <p><b>Example usage (2):</b>
 * <code><pre>
 *     difference = new NestedLoopsDifference<Integer>(
 *         Arrays.asList(1, 2, 2, 3).iterator(),
 *         Arrays.asList(1, 2, 3).iterator(),
 *         32,
 *         8,
 *         new Function<Object, Iterator<Integer>>() {
 *             public Iterator<Integer> invoke() {
 *                 return Arrays.asList(1, 2, 3).iterator();
 *             }
 *         },
 *         false
 *     );
 *
 *     difference.open();
 *
 *     while (difference.hasNext())
 *         System.out.println(difference.next());
 *
 *     difference.close();
 * </pre></code>
 * This example computes the difference between two iterators based on the
 * lists {1, 2, 2, 3} and {1, 2, 3}. The memory usage is equal to that in
 * example 1, but in this case the flag <code>all</code> leads to different
 * results.
 * <ul>
 *     <li>
 *         If <code>all == true</code> this operator delivers no results,
 *         because each element of <code>input1</code> is equal to an element
 *         of <code>input2</code>.
 *     </li>
 *     <li>
 *         But if <code>all == false</code> the element '2' is returned because
 *         it is only subtracted once from <code>input1</code>.
 *     </li>
 * </ul>
 * The function specified above resets and returns the input cursor
 * <code>input2</code>, because it supports no <code>reset</code> operation,
 * but has to be traversed several times (inner loop). The predicate comparing
 * two elements concerning equality is set to {@link Equal#DEFAULT_INSTANCE} by
 * default.</p>
 *
 * <p><b>Note:</b> If an input iteration is given by an object of the class
 * {@link Iterator}, i.e., it does not support the <code>peek</code> operation,
 * it is internally wrapped to a cursor.</p>
 *
 * @param <E> the type of the elements returned by this difference.
 * @see java.util.Iterator
 * @see xxl.core.cursors.Cursor
 * @see xxl.core.functions.Function
 * @see xxl.core.predicates.Predicate
 * @see xxl.core.predicates.Equal
 * @see xxl.core.collections.bags.Bag
 * @see xxl.core.cursors.differences.SortBasedDifference
 * @see xxl.core.relational.cursors.NestedLoopsDifference
 */
public class NestedLoopsDifference<E> extends AbstractCursor<E> {

    /**
     * The first (or left) input cursor of the difference operator.
     */
    protected Cursor<E> input1;

    /**
     * The second (or right) input cursor of the difference operator.
     */
    protected Cursor<? extends E> input2;

    /**
     * The output cursor of the difference operator. It is <code>null</code>
     * until the first bag has been built by {@link #hasNextObject()}.
     */
    protected Cursor<E> results = null;

    /**
     * A parameterless function returning a new bag on demand.
     */
    protected Function<?, ? extends Bag<E>> newBag;

    /**
     * The maximum number of elements that can be stored in the bag returned by
     * the function <code>newBag</code>.
     */
    protected int maxTuples;

    /**
     * The predicate used to determine a match between an element of
     * <code>input2</code> and an element of <code>input1</code>.
     */
    protected Predicate<? super E> predicate;

    /**
     * A flag signaling if all matches or only one match returned by the
     * predicate should be removed from the internal bag.
     */
    protected boolean all;

    /**
     * A parameterless function used to reset and return the second input
     * cursor.
     */
    protected Function<?, ? extends Cursor<? extends E>> resetInput2;

    /**
     * A bit set storing, per position in <code>input2</code>, whether that
     * element has already removed an element of <code>input1</code>. It is
     * only allocated if <code>all == false</code>, otherwise it stays
     * <code>null</code>.
     */
    protected BitSet removedElements;

    /**
     * The number of elements of <code>input2</code> counted at construction
     * time. Only meaningful if <code>all == false</code>; it is used to
     * allocate a fresh {@link #removedElements} bit set on {@link #reset()}.
     */
    private int input2Size;

    /**
     * Creates a new instance of the nested-loops difference operator. Every
     * input iterator is wrapped to a cursor. Determines the maximum number of
     * elements that can be stored in the bag used for the temporary storage of
     * the elements of <code>input1</code> in main memory:
     * <pre>
     *     maxTuples = memSize / objectSize - 1
     * </pre>
     * This constructor should only be used if cursor <code>input2</code> is
     * not resettable.
     *
     * <p>If <code>all == false</code>, the given <code>input2</code> is
     * traversed once completely in order to count its elements, and the
     * second input is afterwards obtained anew from
     * <code>resetInput2</code>.</p>
     *
     * @param input1 the first input iterator where the elements have to be
     *        subtracted from.
     * @param input2 the second input iterator containing the elements that
     *        have to be subtracted.
     * @param memSize the maximum amount of available main memory that can be
     *        used for the bag.
     * @param objectSize the size (bytes) needed to store one object of an
     *        input cursor.
     * @param newBag a parameterless function delivering an empty bag on
     *        demand. This bag is used to store the elements of cursor
     *        <code>input1</code>.
     * @param resetInput2 a parameterless function that delivers the second
     *        input cursor again. This constructor should only be used if the
     *        second input cursor does not support the <code>reset</code>
     *        functionality.
     * @param predicate a binary predicate that has to determine a match
     *        between an element of <code>input1</code> and an element of
     *        <code>input2</code>.
     * @param all a boolean flag signaling if all elements contained in the bag
     *        that have a positive match concerning the predicate will be
     *        removed or only one element will be removed.
     * @throws IllegalArgumentException if <code>objectSize</code> is not
     *         positive or if not enough main memory is available, i.e., the
     *         bag could not hold at least one element.
     */
    public NestedLoopsDifference(Iterator<E> input1, Iterator<? extends E> input2, int memSize, int objectSize, Function<?, ? extends Bag<E>> newBag, final Function<?, ? extends Iterator<? extends E>> resetInput2, Predicate<? super E> predicate, boolean all) {
        // Validate before dividing: a zero object size would otherwise raise
        // an ArithmeticException and a negative one a meaningless capacity.
        if (objectSize <= 0)
            throw new IllegalArgumentException("Object size must be positive: " + objectSize);
        int capacity = memSize / objectSize - 1;
        // Equivalent to memSize < 2*objectSize for positive sizes, but
        // cannot overflow.
        if (capacity < 1)
            throw new IllegalArgumentException("Insufficient main memory available.");

        this.input1 = Cursors.wrap(input1);
        this.input2 = Cursors.wrap(input2);
        this.newBag = newBag;
        this.resetInput2 = new AbstractFunction<Object, Cursor<? extends E>>() {
            public Cursor<? extends E> invoke() {
                return Cursors.wrap(resetInput2.invoke());
            }
        };
        this.predicate = predicate;
        this.all = all;
        this.maxTuples = capacity;

        if (!all) {
            // One bit per element of input2 is needed, so the input has to
            // be counted by traversing it once.
            int counter = 0;
            for ( ; input2.hasNext(); counter++)
                input2.next();
            input2Size = counter;
            removedElements = new BitSet(counter);
            // The traversal above exhausted the given iterator, so the FIELD
            // (not the shadowing parameter) must be re-bound to a fresh
            // second input.
            this.input2 = this.resetInput2.invoke();
        }
    }

    /**
     * Creates a new instance of the nested-loops difference operator. Every
     * input iterator is wrapped to a cursor. Determines the maximum number of
     * elements that can be stored in the bag used for the temporary storage of
     * the elements of <code>input1</code> in main memory:
     * <pre>
     *     maxTuples = memSize / objectSize - 1
     * </pre>
     * Uses a bag factory that creates a new empty
     * {@link xxl.core.collections.bags.ListBag list-bag} on each invocation.
     * Determines the equality between an element of <code>input1</code> and
     * <code>input2</code> with the help of the default instance of the
     * {@link Equal equality} predicate. This constructor should only be used
     * if cursor <code>input2</code> is not resettable.
     *
     * @param input1 the first input iterator where the elements have to be
     *        subtracted from.
     * @param input2 the second input iterator containing the elements that
     *        have to be subtracted.
     * @param memSize the maximum amount of available main memory that can be
     *        used for the bag.
     * @param objectSize the size (bytes) needed to store one object of an
     *        input cursor.
     * @param resetInput2 a parameterless function that delivers the second
     *        input cursor again. This constructor should only be used if the
     *        second input cursor does not support the <code>reset</code>
     *        functionality.
     * @param all a boolean flag signaling if all elements contained in the bag
     *        that have a positive match concerning the predicate will be
     *        removed or only one element will be removed.
     * @throws IllegalArgumentException if <code>objectSize</code> is not
     *         positive or if not enough main memory is available.
     */
    public NestedLoopsDifference(Iterator<E> input1, Iterator<? extends E> input2, int memSize, int objectSize, Function<?, ? extends Iterator<? extends E>> resetInput2, boolean all) {
        this(
            input1,
            input2,
            memSize,
            objectSize,
            new AbstractFunction<Object, ListBag<E>>() {
                public ListBag<E> invoke() {
                    return new ListBag<E>();
                }
            },
            resetInput2,
            Equal.DEFAULT_INSTANCE,
            all
        );
    }

    /**
     * Creates a new instance of the nested-loops difference operator. Every
     * input iterator is wrapped to a cursor. Determines the maximum number of
     * elements that can be stored in the bag used for the temporary storage of
     * the elements of <code>input1</code> in main memory:
     * <pre>
     *     maxTuples = memSize / objectSize - 1
     * </pre>
     * <code>Input2</code> has to support the <code>reset</code> operation,
     * otherwise an {@link java.lang.UnsupportedOperationException} will be
     * thrown!
     *
     * @param input1 the first input iterator where the elements have to be
     *        subtracted from.
     * @param input2 the second input cursor containing the elements that
     *        have to be subtracted. It is reset each time the second input
     *        has to be traversed again.
     * @param memSize the maximum amount of available main memory that can be
     *        used for the bag.
     * @param objectSize the size (bytes) needed to store one object of an
     *        input cursor.
     * @param newBag a parameterless function delivering an empty bag on
     *        demand. This bag is used to store the elements of cursor
     *        <code>input1</code>.
     * @param predicate a binary predicate that has to determine a match
     *        between an element of <code>input1</code> and an element of
     *        <code>input2</code>.
     * @param all a boolean flag signaling if all elements contained in the bag
     *        that have a positive match concerning the predicate will be
     *        removed or only one element will be removed.
     * @throws IllegalArgumentException if <code>objectSize</code> is not
     *         positive or if not enough main memory is available.
     */
    public NestedLoopsDifference(Iterator<E> input1, final Cursor<? extends E> input2, int memSize, int objectSize, Function<?, ? extends Bag<E>> newBag, Predicate<? super E> predicate, boolean all) {
        this(
            input1,
            input2,
            memSize,
            objectSize,
            newBag,
            new AbstractFunction<Object, Cursor<? extends E>>() {
                public Cursor<? extends E> invoke() {
                    input2.reset();
                    return input2;
                }
            },
            predicate,
            all
        );
    }

    /**
     * Creates a new instance of the nested-loops difference operator. Every
     * input iterator is wrapped to a cursor. Determines the maximum number of
     * elements that can be stored in the bag used for the temporary storage of
     * the elements of <code>input1</code> in main memory:
     * <pre>
     *     maxTuples = memSize / objectSize - 1
     * </pre>
     * Uses a bag factory that creates a new empty
     * {@link xxl.core.collections.bags.ListBag list-bag} on each invocation.
     * Determines the equality between an element of <code>input1</code> and
     * <code>input2</code> with the help of the default instance of the
     * {@link Equal equality} predicate. <code>Input2</code> has to support the
     * <code>reset</code> operation, otherwise an
     * {@link java.lang.UnsupportedOperationException} will be thrown!
     *
     * @param input1 the first input iterator where the elements have to be
     *        subtracted from.
     * @param input2 the second input cursor containing the elements that
     *        have to be subtracted.
     * @param memSize the maximum amount of available main memory that can be
     *        used for the bag.
     * @param objectSize the size (bytes) needed to store one object of an
     *        input cursor.
     * @param all a boolean flag signaling if all elements contained in the bag
     *        that have a positive match concerning the predicate will be
     *        removed or only one element will be removed.
     * @throws IllegalArgumentException if <code>objectSize</code> is not
     *         positive or if not enough main memory is available.
     */
    public NestedLoopsDifference(Iterator<E> input1, Cursor<? extends E> input2, int memSize, int objectSize, boolean all) {
        this(
            input1,
            input2,
            memSize,
            objectSize,
            new AbstractFunction<Object, ListBag<E>>() {
                public ListBag<E> invoke() {
                    return new ListBag<E>();
                }
            },
            Equal.DEFAULT_INSTANCE,
            all
        );
    }

    /**
     * Opens the nested-loops difference operator, i.e., signals the cursor to
     * reserve resources, open input iterations, etc. Before a cursor has been
     * opened calls to methods like <code>next</code> or <code>peek</code> are
     * not guaranteed to yield proper results. Therefore <code>open</code> must
     * be called before a cursor's data can be processed. Multiple calls to
     * <code>open</code> do not have any effect, i.e., if <code>open</code> was
     * called the cursor remains in the state <i>opened</i> until its
     * <code>close</code> method is called.
     *
     * <p>Note, that a call to the <code>open</code> method of a closed cursor
     * usually does not open it again because of the fact that its state
     * generally cannot be restored when resources are released respectively
     * files are closed.</p>
     */
    public void open() {
        if (isOpened)
            return;
        super.open();
        input1.open();
        input2.open();
    }

    /**
     * Closes the nested-loops difference operator, i.e., signals the cursor to
     * clean up resources and close its input and output cursors. When a cursor
     * has been closed calls to methods like <code>next</code> or
     * <code>peek</code> are not guaranteed to yield proper results. Multiple
     * calls to <code>close</code> do not have any effect, i.e., if
     * <code>close</code> was called the cursor remains in the state
     * <i>closed</i>.
     *
     * <p>Note, that a closed cursor usually cannot be opened again because of
     * the fact that its state generally cannot be restored when resources are
     * released respectively files are closed.</p>
     */
    public void close() {
        if (isClosed)
            return;
        super.close();
        input1.close();
        input2.close();
        // The result cursor only exists after the first bag has been built.
        if (results != null)
            results.close();
    }

    /**
     * Returns <code>true</code> if the iteration has more elements. (In other
     * words, returns <code>true</code> if <code>next</code> or
     * <code>peek</code> would return an element rather than throwing an
     * exception.)
     *
     * <p>If the current result cursor still delivers elements,
     * <code>true</code> is returned immediately. Otherwise a new temporary bag
     * is created by calling <code>newBag.invoke()</code>. The elements of
     * cursor <code>input1</code> are inserted into this bag one by one; after
     * every insertion the second input is traversed completely and the
     * matching elements are removed from the bag (all of them or only one per
     * element of <code>input2</code>, depending on the flag <code>all</code>).
     * Filling stops as soon as the bag holds <code>maxTuples</code> elements
     * or <code>input1</code> is exhausted. At last the bag's
     * <code>cursor</code> method is called and the result cursor's reference
     * is set to this cursor. If the result cursor contains any elements,
     * <code>true</code> is returned, otherwise <code>false</code>. If the
     * cursor <code>input1</code> contains further elements the whole procedure
     * is repeated on a later call.</p>
     *
     * @return <code>true</code> if the nested-loops difference operator has
     *         more elements.
     */
    protected boolean hasNextObject() {
        if (results != null && results.hasNext())
            return true;

        // The previous result cursor (if any) is exhausted; release it
        // before it is replaced.
        if (results != null)
            results.close();

        Bag<E> tmpBag = newBag.invoke();
        while (input1.hasNext()) {
            if (tmpBag.size() < maxTuples)
                tmpBag.insert(input1.next());
            subtractSecondInput(tmpBag);
            if (tmpBag.size() == maxTuples)
                break;
        }
        results = tmpBag.cursor();
        return results.hasNext();
    }

    /**
     * Traverses the second input once and removes the matching elements from
     * the given bag. Afterwards the second input is replaced by the cursor
     * delivered by <code>resetInput2</code>.
     *
     * @param bag the bag holding the candidate elements of
     *        <code>input1</code>.
     */
    private void subtractSecondInput(Bag<E> bag) {
        int position = 0;
        while (input2.hasNext()) {
            Cursor<E> matches = bag.query(new RightBind<E>(predicate, input2.peek()));
            while (matches.hasNext()) {
                matches.next();
                if (!all) {
                    // Every element of input2 may remove at most one
                    // element of input1 over the whole run.
                    if (!removedElements.get(position)) {
                        matches.remove();
                        removedElements.set(position);
                    }
                    break;
                }
                else
                    matches.remove();
            }
            if (input2.hasNext()) {
                input2.next();
                position++;
            }
        }
        input2 = resetInput2.invoke();
    }

    /**
     * Returns the next element in the iteration. This element will be removed
     * from the iteration, if <code>next</code> is called. This method returns
     * the next element of the result cursor and removes it from the underlying
     * bag.
     *
     * @return the next element in the iteration.
     */
    protected E nextObject() {
        E result = results.next();
        results.remove();
        return result;
    }

    /**
     * Removes from the underlying data structure the last element returned by
     * the nested-loops difference operator (optional operation). This method
     * can be called only once per call to <code>next</code> or
     * <code>peek</code> and removes the element returned by this method. Note,
     * that between a call to <code>next</code> and <code>remove</code> the
     * invocation of <code>peek</code> or <code>hasNext</code> is forbidden.
     * The behaviour of a cursor is unspecified if the underlying data
     * structure is modified while the iteration is in progress in any way
     * other than by calling this method. This method is only supported if the
     * bag's size is limited to only one element, otherwise an
     * {@link java.lang.UnsupportedOperationException} will be thrown.
     *
     * @throws IllegalStateException if the <code>next</code> or
     *         <code>peek</code> method has not yet been called, or the
     *         <code>remove</code> method has already been called after the
     *         last call to the <code>next</code> or <code>peek</code> method.
     * @throws UnsupportedOperationException if the <code>remove</code>
     *         operation is not supported by the nested-loops difference
     *         operator, i.e., the bag's size is greater than 1.
     */
    public void remove() throws IllegalStateException, UnsupportedOperationException {
        super.remove();
        // NOTE(review): nextObject() has already called results.remove() for
        // this element - confirm that the bag cursor tolerates this second
        // call.
        results.remove();
        input1.remove();
    }

    /**
     * Returns <code>true</code> if the <code>remove</code> operation is
     * supported by the nested-loops difference operator. Otherwise it returns
     * <code>false</code>.
     *
     * @return <code>true</code> if the <code>remove</code> operation is
     *         supported by the cursor, otherwise <code>false</code>.
     */
    public boolean supportsRemove() {
        return maxTuples == 1 && input1.supportsRemove();
    }

    /**
     * Replaces the object that was returned by the last call to
     * <code>next</code> or <code>peek</code> (optional operation). This
     * operation must not be called after a call to <code>hasNext</code>. It
     * should follow a call to <code>next</code> or <code>peek</code>. This
     * method should be called only once per call to <code>next</code> or
     * <code>peek</code>. The behaviour of a nested-loops difference operator
     * is unspecified if the underlying data structure is modified while the
     * iteration is in progress in any way other than by calling this method.
     * This method is only supported if the bag's size is limited to only one
     * element, otherwise an {@link java.lang.UnsupportedOperationException}
     * will be thrown.
     *
     * @param object the object that replaces the object returned by the last
     *        call to <code>next</code> or <code>peek</code>.
     * @throws IllegalStateException if the <code>next</code> or
     *         <code>peek</code> method has not yet been called, or the
     *         <code>update</code> method has already been called after the
     *         last call to the <code>next</code> or <code>peek</code> method.
     * @throws UnsupportedOperationException if the <code>update</code>
     *         operation is not supported by the nested-loops difference
     *         operator, i.e., the bag's size is greater than 1.
     */
    public void update(E object) throws IllegalStateException, UnsupportedOperationException {
        super.update(object);
        // NOTE(review): nextObject() has already removed this element from
        // the result cursor - confirm that updating it here is still valid.
        results.update(object);
        input1.update(object);
    }

    /**
     * Returns <code>true</code> if the <code>update</code> operation is
     * supported by the nested-loops difference operator, i.e., the bag is
     * limited to one element and <code>input1</code> supports updates.
     * Otherwise it returns <code>false</code>.
     *
     * @return <code>true</code> if the <code>update</code> operation is
     *         supported by the cursor, otherwise <code>false</code>.
     */
    public boolean supportsUpdate() {
        return maxTuples == 1 && input1.supportsUpdate();
    }

    /**
     * Resets the nested-loops difference operator to its initial state
     * (optional operation). So the caller is able to traverse the underlying
     * data structure again. The modifications, removes and updates concerning
     * the underlying data structure, are still persistent. This method resets
     * the input cursors, closes the result cursor (if one exists) and sets it
     * to <code>null</code>. If <code>all == false</code>, the bookkeeping of
     * already used elements of <code>input2</code> is cleared as well.
     *
     * @throws UnsupportedOperationException if the <code>reset</code>
     *         operation is not supported by the nested-loops difference
     *         operator.
     */
    public void reset() throws UnsupportedOperationException {
        super.reset();
        input1.reset();
        input2.reset();
        // No result cursor exists if reset is called before the first bag
        // has been built.
        if (results != null) {
            results.close();
            results = null;
        }
        // A new traversal must start with no element of input2 marked as
        // used, otherwise nothing would be subtracted the second time.
        if (removedElements != null)
            removedElements = new BitSet(input2Size);
    }

    /**
     * Returns <code>true</code> if the <code>reset</code> operation is
     * supported by the nested-loops difference operator. Otherwise it returns
     * <code>false</code>.
     *
     * @return <code>true</code> if the <code>reset</code> operation is
     *         supported by the cursor, otherwise <code>false</code>.
     */
    public boolean supportsReset() {
        return input1.supportsReset() && input2.supportsReset();
    }
}