package org.streaminer.stream.frequency;

import org.streaminer.util.hash.function.HashFunction;
import org.streaminer.util.hash.function.TwoUniversalHashFunction;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.NoSuchElementException;

/**
 * <p>
 * {@link Learner}-part of the implementation of the CountMinSketch algorithm from the paper
 * 'An improved data stream summary: the count-min sketch and its applications'
 * written by 'Cormode, G. and Muthukrishnan, S. (2003)'.
 * </p>
 * <p>
 * Each update increments one counter per hash function; the frequency estimate of an
 * item is the smallest of the counters that item hashes to.
 * </p>
 *
 * @param <T> the type of the items being counted
 * @author Marcin Skirzynski (main work), Benedikt Kulmann (modifications)
 */
public class CountMinSketch<T> extends CountSketch<T> {

    /**
     * <p>
     * Constructor of the CountMinSketch algorithm. This construction
     * can take quite a long time since the construction of the hash functions
     * is rather time-consuming.
     * </p>
     *
     * @param domain            the (estimated) domain, i.e. how many different items are expected
     * @param nrOfHashFunctions the number of hash functions which determine a bucket
     * @param nrOfbuckets       the number of buckets where a counter will be maintained
     * @param k                 parameter for the top-k variant. If you want to disable
     *                          the top-k overhead, then set k to 0 or lower
     */
    public CountMinSketch(int domain, int nrOfHashFunctions, int nrOfbuckets, int k) {
        super(domain, nrOfHashFunctions, nrOfbuckets, k);
    }

    /**
     * <p>
     * Creates the hash functions used by this sketch. Only the {@code h} hash functions
     * (which select a bucket) are created, since the CountMinSketch algorithm always
     * increments the values in the data; {@code s} is initialised to an empty list.
     * </p>
     * <p>
     * NOTE(review): this method is not called from this class; presumably it overrides
     * a hook invoked by the {@code CountSketch} constructor - confirm against the superclass.
     * </p>
     *
     * @param domain            the (estimated) domain, i.e. how many different items are expected
     * @param nrOfHashFunctions the number of hash functions which determine a bucket
     * @param nrOfbuckets       the number of buckets where a counter will be maintained
     */
    protected void initializeHashes(int domain, int nrOfHashFunctions, int nrOfbuckets) {
        h = new ArrayList<HashFunction<T>>();
        s = new ArrayList<HashFunction<T>>();

        for (int i = 0; i < nrOfHashFunctions; i++) {
            h.add(new TwoUniversalHashFunction<T>(domain, nrOfbuckets));
        }
    }

    /**
     * <p>
     * Updates the data. For each hash function the corresponding
     * bucket will be incremented by one.
     * </p>
     *
     * @param item the item whose occurrence is recorded
     * @return {@code true} if {@code k <= 0} (i.e. the top-k variant is disabled),
     *         {@code false} otherwise
     */
    @Override
    protected boolean updateData(T item) {
        for (int row = 0; row < h.size(); row++) {
            // Row 'row' of the counter table belongs to hash function 'row'.
            int bucket = (int) h.get(row).hash(item);
            data[row][bucket] += 1;
        }
        return k <= 0;
    }

    /**
     * <p>
     * Estimates the frequency of an item by returning the smallest counter
     * among the buckets the item is hashed to, one bucket per hash function.
     * </p>
     *
     * @param item the item whose frequency is estimated
     * @return the minimum of the counters selected by the hash functions for {@code item}
     * @throws NoSuchElementException if no hash functions are configured
     */
    @Override
    public long estimateFrequency(T item) {
        if (h.isEmpty()) {
            // Same failure the previous Collections.min-based implementation produced
            // for an empty set of counters.
            throw new NoSuchElementException("no hash functions configured");
        }

        // Track the minimum on primitives instead of boxing every counter into a collection.
        int smallest = Integer.MAX_VALUE;
        for (int row = 0; row < h.size(); row++) {
            int bucket = (int) h.get(row).hash(item);
            smallest = Math.min(smallest, data[row][bucket]);
        }
        return smallest;
    }
}