/*
 * Copyright (C) 2011 Clearspring Technologies, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.streaminer.stream.frequency.topk;

import org.streaminer.stream.frequency.util.ScoredItem;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import org.streaminer.stream.frequency.util.CountEntry;

/**
 * Based on the <i>Space-Saving</i> algorithm and the <i>Stream-Summary</i>
 * data structure as described in:
 * <i>Efficient Computation of Frequent and Top-k Elements in Data Streams</i>
 * by Metwally, Agrawal, and Abbadi
 *
 * Ideally used in multithreaded applications, otherwise see {@link StreamSummary}
 *
 * @param <T> type of data in the stream to be summarized
 * @author Eric Vlaanderen
 */
public class ConcurrentStreamSummary<T> implements ITopK<T> {

    /** Maximum number of distinct items kept in {@link #itemMap}. */
    private final int capacity;

    /** Tracked items, keyed by the stream element they count. */
    private final ConcurrentHashMap<T, ScoredItem<T>> itemMap;

    /** Current eviction candidate: the item last found to have the smallest count. */
    private final AtomicReference<ScoredItem<T>> minVal;

    /** Number of insertions counted toward capacity; may overshoot it (see {@link #size()}). */
    private final AtomicLong size;

    /** Latched to {@code true} the first time an insertion would exceed {@link #capacity}. */
    private final AtomicBoolean reachCapacity;

    /**
     * Creates an empty summary.
     *
     * @param capacity maximum number of distinct items to track; must be at least 1
     * @throws IllegalArgumentException if {@code capacity} is less than 1
     */
    public ConcurrentStreamSummary(final int capacity) {
        // A zero-capacity summary has no minimum item to evict, so the very first
        // add() would dereference a null eviction candidate. Reject it up front.
        if (capacity < 1) {
            throw new IllegalArgumentException("capacity must be at least 1, got " + capacity);
        }
        this.capacity = capacity;
        this.minVal = new AtomicReference<ScoredItem<T>>();
        this.size = new AtomicLong(0);
        this.itemMap = new ConcurrentHashMap<T, ScoredItem<T>>(capacity);
        this.reachCapacity = new AtomicBoolean(false);
    }

    /**
     * Records one occurrence of {@code element}.
     *
     * @param element the stream element observed
     * @return see {@link #add(Object, long)}
     */
    @Override
    public boolean add(final T element) {
        return add(element, 1);
    }

    /**
     * Records {@code incrementCount} occurrences of {@code element}.
     *
     * <p>If the element is already tracked its count is increased. Otherwise it is
     * inserted; once the summary is full, the current minimum item is evicted and the
     * new item takes over the evicted count, which is also recorded as its error.
     *
     * @param element        the stream element observed
     * @param incrementCount amount to add to the element's count
     * @return {@code true} if the element was already tracked and its updated count
     *         differs from {@code incrementCount}; {@code false} if it was newly inserted
     */
    @Override
    public boolean add(final T element, final long incrementCount) {
        long val = incrementCount;
        ScoredItem<T> value = new ScoredItem<T>(element, incrementCount);
        ScoredItem<T> oldVal = itemMap.putIfAbsent(element, value);
        if (oldVal != null) {
            val = oldVal.addAndGetCount(incrementCount);
        } else if (reachCapacity.get() || size.incrementAndGet() > capacity) {
            reachCapacity.set(true);

            // Claim the current minimum for eviction and publish the new item in its place.
            ScoredItem<T> oldMinVal = minVal.getAndSet(value);
            itemMap.remove(oldMinVal.getItem());

            while (oldMinVal.isNewItem()) {
                // Wait for the oldMinVal so its error and value are completely up to date.
                // no thread.sleep here due to the overhead of calling it - the waiting time will be microseconds.
            }
            long count = oldMinVal.getCount();

            // The newcomer inherits the evicted count, and that same amount is its error.
            value.addAndGetCount(count);
            value.setError(count);
        }
        // Marks the item as fully initialised so other threads spinning on it can proceed.
        value.setNewItem(false);
        minVal.set(getMinValue());

        return val != incrementCount;
    }

    /**
     * Scans the tracked items for the next eviction candidate.
     *
     * <p>The first item seen is always taken as the initial candidate; after that only
     * items no longer flagged as new and with a strictly smaller count replace it.
     *
     * @return the selected item, or {@code null} if no items are tracked
     */
    private ScoredItem<T> getMinValue() {
        ScoredItem<T> candidate = null;
        for (ScoredItem<T> entry : itemMap.values()) {
            if (candidate == null || (!entry.isNewItem() && entry.getCount() < candidate.getCount())) {
                candidate = entry;
            }
        }
        return candidate;
    }

    /**
     * Renders the tracked items as {@code [(count: item, e: error),...]}, in map
     * iteration order. An empty summary renders as {@code []}.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("[");
        for (ScoredItem<T> entry : itemMap.values()) {
            sb.append('(')
              .append(entry.getCount())
              .append(": ")
              .append(entry.getItem())
              .append(", e: ")
              .append(entry.getError())
              .append("),");
        }
        // Drop the trailing comma only when at least one entry was written; otherwise
        // the opening bracket itself would be removed.
        if (sb.length() > 1) {
            sb.setLength(sb.length() - 1);
        }
        sb.append(']');
        return sb.toString();
    }

    /**
     * Returns up to {@code k} tracked items as item/count pairs.
     *
     * <p>The items are selected by {@link #peekWithScores(int)} and the resulting list
     * is then sorted by {@code CountEntry}'s natural ordering.
     *
     * @param k maximum number of entries to return
     * @return a new list of at most {@code k} entries
     */
    @Override
    public List<CountEntry<T>> peek(final int k) {
        List<CountEntry<T>> toReturn = new ArrayList<CountEntry<T>>(k);

        List<ScoredItem<T>> values = peekWithScores(k);
        for (ScoredItem<T> value : values) {
            toReturn.add(new CountEntry<T>(value.getItem(), value.getCount()));
        }

        Collections.sort(toReturn);
        return toReturn;
    }

    /**
     * Returns the number of items counted toward capacity.
     *
     * <p>The internal counter is incremented by the insertion that triggers an
     * eviction and is never rolled back, so it overshoots the capacity once the
     * summary is full. The value is therefore clamped to the configured capacity.
     *
     * @return the item count, never more than the capacity
     */
    public long size() {
        return Math.min(size.get(), capacity);
    }

    /**
     * Returns snapshot copies of up to {@code k} tracked items, including their errors.
     *
     * <p>All tracked items are copied, sorted by {@code ScoredItem}'s natural ordering
     * (presumably highest count first; confirm against {@code ScoredItem.compareTo}),
     * and the first {@code k} are returned.
     *
     * @param k maximum number of items to return
     * @return a list of at most {@code k} copied items; when truncated this is a
     *         {@code subList} view of the sorted snapshot
     */
    public List<ScoredItem<T>> peekWithScores(final int k) {
        List<ScoredItem<T>> values = new ArrayList<ScoredItem<T>>();
        for (Map.Entry<T, ScoredItem<T>> entry : itemMap.entrySet()) {
            ScoredItem<T> value = entry.getValue();
            // Copy so callers see a stable count/error rather than the live counters.
            values.add(new ScoredItem<T>(value.getItem(), value.getCount(), value.getError()));
        }
        Collections.sort(values);
        values = values.size() > k ? values.subList(0, k) : values;
        return values;
    }
}