/**
* Copyright (C) 2011 Clearspring Technologies, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.streaminer.stream.frequency.topk;
import org.streaminer.stream.frequency.util.Counter;
import java.io.ByteArrayInputStream;
import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectInputStream;
import java.io.ObjectOutput;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import org.streaminer.util.DoublyLinkedList;
import org.streaminer.util.ExternalizableUtil;
import org.streaminer.util.ListNode2;
import org.streaminer.util.Pair;
import java.util.Collections;
import org.streaminer.stream.frequency.util.CountEntry;
/**
* Based on the <i>Space-Saving</i> algorithm and the <i>Stream-Summary</i>
* data structure as described in:
* <i>Efficient Computation of Frequent and Top-k Elements in Data Streams</i>
* by Metwally, Agrawal, and Abbadi
*
* @param <T> type of data in the stream to be summarized
*/
public class StreamSummary<T> implements ITopK<T> {

    /** Maximum number of distinct items monitored at the same time. */
    protected int capacity;

    /** Index from each monitored item to the list node that holds its counter. */
    private HashMap<T, ListNode2<Counter<T>>> counterMap;

    /**
     * Buckets grouping the counters that share the same count. New (empty-count)
     * buckets are enqueued and the minimum bucket is read with {@code first()};
     * {@link #incrementCounter} moves counters towards {@code getNext()} as their
     * count grows.
     */
    protected DoublyLinkedList<Bucket> bucketList;

    /**
     * For de-serialization.
     * <p>
     * Creates an empty summary with a capacity of zero. The internal structures are
     * always initialized, so read-only calls such as {@link #size()},
     * {@link #peek(int)}, {@link #topK(int)} and {@link #toString()} are safe on the
     * resulting instance. Items can only be added once {@link #capacity} has been
     * set to a positive value.
     */
    public StreamSummary() {
        this(0);
    }

    /**
     * @param capacity maximum size (larger capacities improve accuracy)
     */
    public StreamSummary(int capacity) {
        this.capacity = capacity;
        counterMap = new HashMap<T, ListNode2<Counter<T>>>();
        bucketList = new DoublyLinkedList<Bucket>();
    }

    /**
     * @return maximum number of items this summary monitors
     */
    public int getCapacity() {
        return capacity;
    }

    /**
     * Algorithm: <i>Space-Saving</i>. Records a single occurrence of the item.
     *
     * @param item stream element (<i>e</i>)
     * @return false if item was already in the stream summary, true otherwise
     */
    @Override
    public boolean add(T item) {
        return add(item, 1);
    }

    /**
     * Algorithm: <i>Space-Saving</i>. Records {@code incrementCount} occurrences of
     * the element.
     *
     * @param element stream element (<i>e</i>)
     * @param incrementCount amount added to the element's count
     * @return false if item was already in the stream summary, true otherwise
     */
    @Override
    public boolean add(T element, long incrementCount) {
        return offerReturnAll(element, incrementCount).left;
    }

    /**
     * Records {@code incrementCount} occurrences of the item and reports which
     * item, if any, had to give up its counter.
     *
     * @param item stream element (<i>e</i>)
     * @param incrementCount amount added to the item's count
     * @return item dropped from summary if an item was dropped, null otherwise
     */
    public T offerReturnDropped(T item, int incrementCount) {
        return offerReturnAll(item, incrementCount).right;
    }

    /**
     * Records {@code incrementCount} occurrences of the item.
     * <p>
     * An item that is not monitored yet either gets a fresh counter (while fewer
     * than {@link #capacity} items are monitored) or takes over a counter from the
     * minimum bucket. In the latter case the previous owner is dropped and the
     * bucket's count becomes the new item's error.
     *
     * @param item stream element (<i>e</i>)
     * @param incrementCount amount added to the item's count
     * @return {@code Pair<isNewItem, itemDropped>} where isNewItem is the return
     *         value of {@link #add(Object, long)} and itemDropped is null if no
     *         item was dropped
     * @throws IllegalStateException if a new item needs a counter but the summary
     *         has neither free capacity nor any counter to evict (for example
     *         when the capacity is not positive)
     */
    public Pair<Boolean, T> offerReturnAll(T item, long incrementCount) {
        ListNode2<Counter<T>> counterNode = counterMap.get(item);
        boolean isNewItem = (counterNode == null);
        T droppedItem = null;

        if (isNewItem) {
            if (size() < capacity) {
                // Start the new counter in a placeholder bucket of count 0;
                // incrementCounter() below moves it to its real bucket and
                // removes the placeholder once it is empty.
                counterNode = bucketList.enqueue(new Bucket(0))
                        .getValue()
                        .counterList
                        .add(new Counter<T>(bucketList.tail(), item));
            } else {
                if (bucketList.isEmpty()) {
                    // No free slot and nothing to evict: fail with a clear cause
                    // instead of dereferencing a missing minimum bucket.
                    throw new IllegalStateException(
                            "Cannot add an item: no free capacity and no counter to evict (capacity="
                            + capacity + ")");
                }
                // Recycle a counter from the minimum bucket for the new item.
                Bucket min = bucketList.first();
                counterNode = min.counterList.tail();
                Counter<T> counter = counterNode.getValue();
                droppedItem = counter.getItem();
                counterMap.remove(droppedItem);
                counter.setItem(item);
                counter.setError(min.count);
            }
            counterMap.put(item, counterNode);
        }

        incrementCounter(counterNode, incrementCount);

        return new Pair<Boolean, T>(isNewItem, droppedItem);
    }

    /**
     * Adds {@code incrementCount} to the counter and re-links it into the bucket
     * matching its new count, creating that bucket when it does not exist and
     * removing the old bucket when it becomes empty.
     * <p>
     * The destination is searched only in the {@code getNext()} direction from the
     * counter's current bucket, so a negative increment is not handled by this
     * method.
     *
     * @param counterNode list node of the counter to update
     * @param incrementCount amount added to the counter's count
     */
    protected void incrementCounter(ListNode2<Counter<T>> counterNode, long incrementCount) {
        Counter<T> counter = counterNode.getValue();       // count_i
        ListNode2<Bucket> oldNode = counter.getBucketNode();
        Bucket bucket = oldNode.getValue();                // Let Bucket_i be the bucket of count_i
        bucket.counterList.remove(counterNode);            // Detach count_i from Bucket_i's child-list
        counter.incrementCount(incrementCount);

        // Finding the right bucket for count_i.
        // Because a single call may increment the count by more than one, this may
        // not be the adjacent bucket.
        ListNode2<Bucket> bucketNodePrev = oldNode;
        ListNode2<Bucket> bucketNodeNext = bucketNodePrev.getNext();
        while (bucketNodeNext != null) {
            Bucket bucketNext = bucketNodeNext.getValue(); // Let Bucket_i^+ be Bucket_i's neighbor of larger value
            if (counter.getCount() == bucketNext.count) {
                bucketNext.counterList.add(counterNode);   // Attach count_i to Bucket_i^+'s child-list
                break;
            } else if (counter.getCount() > bucketNext.count) {
                bucketNodePrev = bucketNodeNext;
                bucketNodeNext = bucketNodePrev.getNext(); // Continue hunting for an appropriate bucket
            } else {
                // The next bucket is already too large: a new bucket has to be
                // created right after bucketNodePrev.
                bucketNodeNext = null;
            }
        }

        if (bucketNodeNext == null) {
            Bucket bucketNext = new Bucket(counter.getCount());
            bucketNext.counterList.add(counterNode);
            bucketNodeNext = bucketList.addAfter(bucketNodePrev, bucketNext);
        }
        counter.setBucketNode(bucketNodeNext);

        // Cleaning up: if Bucket_i's child-list is empty, detach Bucket_i from the
        // Stream-Summary.
        if (bucket.counterList.isEmpty()) {
            bucketList.remove(oldNode);
        }
    }

    /**
     * Collects up to {@code k} entries by walking the buckets from
     * {@code bucketList.head()} along {@code getPrev()}, then sorts the collected
     * entries with {@link Collections#sort(List)}.
     *
     * @param k maximum number of entries to return
     * @return at most {@code k} item/count entries, in the natural order of
     *         {@code CountEntry}
     */
    @Override
    public List<CountEntry<T>> peek(int k) {
        List<CountEntry<T>> list = new ArrayList<CountEntry<T>>(k);

        for (ListNode2<Bucket> bNode = bucketList.head(); bNode != null; bNode = bNode.getPrev()) {
            Bucket b = bNode.getValue();
            for (Counter<T> c : b.counterList) {
                if (list.size() == k) {
                    Collections.sort(list);
                    return list;
                }
                list.add(new CountEntry<T>(c.getItem(), c.getCount()));
            }
        }

        Collections.sort(list);
        return list;
    }

    /**
     * Collects up to {@code k} counters by walking the buckets from
     * {@code bucketList.head()} along {@code getPrev()}. The counters are returned
     * in traversal order and are the live objects held by this summary.
     *
     * @param k maximum number of counters to return
     * @return at most {@code k} counters
     */
    public List<Counter<T>> topK(int k) {
        List<Counter<T>> topK = new ArrayList<Counter<T>>(k);

        for (ListNode2<Bucket> bNode = bucketList.head(); bNode != null; bNode = bNode.getPrev()) {
            Bucket b = bNode.getValue();
            for (Counter<T> c : b.counterList) {
                if (topK.size() == k) {
                    return topK;
                }
                topK.add(c);
            }
        }

        return topK;
    }

    /**
     * @return number of items stored
     */
    public long size() {
        return counterMap.size();
    }

    /**
     * Renders the summary as {@code [{count:[{item:error},...]},...]}, one group
     * per bucket in the same traversal order used by {@link #topK(int)}.
     *
     * @return textual dump of the buckets and their counters
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append('[');
        for (ListNode2<Bucket> bNode = bucketList.head(); bNode != null; bNode = bNode.getPrev()) {
            Bucket b = bNode.getValue();
            sb.append('{');
            sb.append(b.count);
            sb.append(":[");
            for (Counter<T> c : b.counterList) {
                sb.append('{');
                sb.append(c.getItem());
                sb.append(':');
                sb.append(c.getError());
                sb.append("},");
            }
            if (b.counterList.size() > 0) {
                // Drop the trailing comma after the last counter.
                sb.deleteCharAt(sb.length() - 1);
            }
            sb.append("]},");
        }
        if (bucketList.size() > 0) {
            // Drop the trailing comma after the last bucket.
            sb.deleteCharAt(sb.length() - 1);
        }
        sb.append(']');
        return sb.toString();
    }

    /**
     * Group of counters that all currently have the same count.
     */
    public class Bucket {

        /** Counters whose count equals {@link #count}. */
        protected DoublyLinkedList<Counter<T>> counterList;

        /** Count shared by every counter in this bucket. */
        private long count;

        /**
         * @param count count shared by the counters placed in this bucket
         */
        public Bucket(long count) {
            this.count = count;
            this.counterList = new DoublyLinkedList<Counter<T>>();
        }
    }
}