package org.streaminer.stream.sampler; import java.io.Serializable; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Random; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Implementation of the chain-sample algorithm. When an item is added to the sample, * a random replacement is generated ad upon the arrival of this item, the old one * is replaced, composing a chain of replacements. * * Reference: * Babcock, Brian, Mayur Datar, and Rajeev Motwani. "Sampling from a moving * window over streaming data." Proceedings of the thirteenth annual ACM-SIAM * symposium on Discrete algorithms. Society for Industrial and Applied * Mathematics, 2002. * * @author Maycon Viana Bordin <mayconbordin@gmail.com> */ public class ChainSampler implements ISampleList<Serializable> { private static final Logger LOG = LoggerFactory.getLogger(ChainSampler.class); private int k; private int n; private long count = 0; private int fillSample = 0; private Serializable[] items; private Map<Integer, Integer> replacements; private Random rand = new Random(); /** * Create a new sampler of size k for a window of size n. * @param k The size of the sample * @param n The size of the window */ public ChainSampler(int k, int n) { this.k = k; this.n = n; items = new Serializable[k]; replacements = new HashMap<Integer, Integer>(k); } /** * Give an item to the sampler so that it can decide to add or not as a sample. * @param item The item to be sampled */ public void sample(Serializable item) { int i = (int) (count%n); if (replacements.containsKey(i)) { int replace = replacements.get(i); // replace the old item items[replace] = item; int next = rand.nextInt(n); LOG.info(String.format("Item=%s; i=%d; b=%d; next=%d", item, i, replace, next)); replacements.remove(i); replacements.put(next, replace); } // this will build the initial sample else if (fillSample < k) { double prob = ((double)Math.min(i, n))/((double)n); if (rand.nextDouble() < prob) { int bucket = fillSample++; int next = rand.nextInt(n); items[bucket] = item; replacements.put(next, bucket); LOG.info(String.format("[init] Item=%s; i=%d; b=%d; next=%d", item, i, bucket, next)); } } count++; } /** * Sample a list of items * @param t The list of items to be sampled */ public void sample(Serializable... t) { for (Serializable item : t) { sample(item); } } /** * @return The list of currently sampled items */ public Collection<Serializable> getSamples() { List<Serializable> result = new ArrayList<Serializable>(); for (Serializable item : items) if (item != null) result.add(item); return result; } /** * @return The size of the sample */ public int getSize() { return (fillSample < k) ? fillSample : k; } }