/*
* Copyright (C) 2011 Clearspring Technologies, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.streaminer.stream.frequency.topk;
import org.streaminer.stream.frequency.util.ScoredItem;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import org.streaminer.stream.frequency.util.CountEntry;
/**
 * Based on the <i>Space-Saving</i> algorithm and the <i>Stream-Summary</i>
 * data structure as described in:
 * <i>Efficient Computation of Frequent and Top-k Elements in Data Streams</i>
 * by Metwally, Agrawal, and Abbadi.
 *
 * <p>Ideally used in multithreaded applications, otherwise see {@link StreamSummary}.
 *
 * <p>Items are kept in a {@link ConcurrentHashMap}; once more than {@code capacity}
 * distinct items have been inserted, every new item replaces the item currently
 * referenced as the minimum and inherits its count as both starting count and error.
 *
 * @param <T> type of data in the stream to be summarized
 * @author Eric Vlaanderen
 */
public class ConcurrentStreamSummary<T> implements ITopK<T> {
    /** Maximum number of distinct items tracked at once. */
    private final int capacity;
    /** Tracked items keyed by the stream element they count. */
    private final ConcurrentHashMap<T, ScoredItem<T>> itemMap;
    /** Item chosen as the current minimum; it is the next one to be replaced. */
    private final AtomicReference<ScoredItem<T>> minVal;
    /** Number of first-time insertions counted until the capacity flag latches. */
    private final AtomicLong size;
    /** Latches to {@code true} the first time an insertion exceeds {@code capacity}. */
    private final AtomicBoolean reachCapacity;

    /**
     * Creates an empty summary.
     *
     * @param capacity maximum number of distinct items to track; must be positive
     * @throws IllegalArgumentException if {@code capacity} is not positive
     */
    public ConcurrentStreamSummary(final int capacity) {
        // A capacity of zero would send the very first add() into the eviction
        // branch with no minimum recorded yet, so reject it up front.
        if (capacity <= 0) {
            throw new IllegalArgumentException("capacity must be positive: " + capacity);
        }
        this.capacity = capacity;
        this.minVal = new AtomicReference<ScoredItem<T>>();
        this.size = new AtomicLong(0);
        this.itemMap = new ConcurrentHashMap<T, ScoredItem<T>>(capacity);
        this.reachCapacity = new AtomicBoolean(false);
    }

    /**
     * Adds one occurrence of {@code element}.
     *
     * @param element the stream element observed
     * @return see {@link #add(Object, long)}
     */
    @Override
    public boolean add(final T element) {
        return add(element, 1);
    }

    /**
     * Adds {@code incrementCount} occurrences of {@code element}.
     *
     * <p>If the element is already tracked its count is increased. Otherwise it is
     * inserted, and when the summary is over capacity the item currently referenced
     * as the minimum is removed and its count is added to the new item and recorded
     * as the new item's error.
     *
     * @param element        the stream element observed
     * @param incrementCount amount to add to the element's count
     * @return {@code true} if the element was already tracked and its updated count
     *         differs from {@code incrementCount}; {@code false} if it was newly inserted
     */
    @Override
    public boolean add(final T element, final long incrementCount) {
        long val = incrementCount;
        ScoredItem<T> value = new ScoredItem<T>(element, incrementCount);
        ScoredItem<T> oldVal = itemMap.putIfAbsent(element, value);
        if (oldVal != null) {
            val = oldVal.addAndGetCount(incrementCount);
        } else if (reachCapacity.get() || size.incrementAndGet() > capacity) {
            reachCapacity.set(true);

            // Publish the new item as the minimum placeholder and evict the old minimum.
            ScoredItem<T> oldMinVal = minVal.getAndSet(value);
            itemMap.remove(oldMinVal.getItem());

            while (oldMinVal.isNewItem()) {
                // Wait for the oldMinVal so its error and value are completely up to date.
                // no thread.sleep here due to the overhead of calling it - the waiting time will be microseconds.
            }
            long count = oldMinVal.getCount();

            value.addAndGetCount(count);
            value.setError(count);
        }
        // Mark the item as fully initialised so threads spinning on it can proceed.
        value.setNewItem(false);
        minVal.set(getMinValue());

        return val != incrementCount;
    }

    /**
     * Scans all tracked items for the one to treat as the minimum.
     *
     * <p>The first item visited is always taken as the initial candidate; after that
     * only items no longer flagged as new and with a strictly smaller count replace it.
     *
     * @return the selected item, or {@code null} if no items are tracked
     */
    private ScoredItem<T> getMinValue() {
        ScoredItem<T> candidate = null;
        for (ScoredItem<T> entry : itemMap.values()) {
            if (candidate == null || (!entry.isNewItem() && entry.getCount() < candidate.getCount())) {
                candidate = entry;
            }
        }
        return candidate;
    }

    /**
     * Renders the tracked items as {@code [(count: item, e: error),...]}.
     *
     * @return the textual form; {@code "[]"} when nothing is tracked
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("[");
        for (ScoredItem<T> entry : itemMap.values()) {
            sb.append("(" + entry.getCount() + ": " + entry.getItem() + ", e: " + entry.getError() + "),");
        }
        // Drop the trailing comma only if at least one entry was written; otherwise
        // the last character is the opening bracket and must be kept.
        if (sb.length() > 1) {
            sb.deleteCharAt(sb.length() - 1);
        }
        sb.append("]");
        return sb.toString();
    }

    /**
     * Returns up to {@code k} tracked items as count entries.
     *
     * <p>The items are selected by {@link #peekWithScores(int)} and the resulting
     * entries are sorted by the natural ordering of {@code CountEntry}.
     *
     * @param k maximum number of entries to return
     * @return a new list with at most {@code k} entries
     */
    @Override
    public List<CountEntry<T>> peek(final int k) {
        List<CountEntry<T>> toReturn = new ArrayList<CountEntry<T>>(k);

        List<ScoredItem<T>> values = peekWithScores(k);
        for (ScoredItem<T> value : values) {
            toReturn.add(new CountEntry<T>(value.getItem(), value.getCount()));
        }
        Collections.sort(toReturn);
        return toReturn;
    }

    /**
     * Returns the number of items tracked, never more than the capacity.
     *
     * <p>The internal counter is incremented one step past {@code capacity} (possibly
     * more when several threads race) before the capacity flag latches, so the value
     * is clamped here.
     *
     * @return the number of tracked items, at most {@code capacity}
     */
    public long size() {
        return Math.min(size.get(), capacity);
    }

    /**
     * Returns copies of up to {@code k} tracked items, including their error values.
     *
     * <p>Every tracked item is copied, the copies are sorted by the natural ordering
     * of {@code ScoredItem}, and the first {@code k} are returned.
     *
     * @param k maximum number of items to return
     * @return a list with at most {@code k} item copies
     */
    public List<ScoredItem<T>> peekWithScores(final int k) {
        List<ScoredItem<T>> values = new ArrayList<ScoredItem<T>>();
        for (Map.Entry<T, ScoredItem<T>> entry : itemMap.entrySet()) {
            ScoredItem<T> value = entry.getValue();
            // Copy so later concurrent updates do not change the returned snapshot.
            values.add(new ScoredItem<T>(value.getItem(), value.getCount(), value.getError()));
        }
        Collections.sort(values);
        values = values.size() > k ? values.subList(0, k) : values;
        return values;
    }
}