package org.streaminer.stream.frequency;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.streaminer.stream.frequency.util.CountEntry;
import org.streaminer.util.hash.HashUtils;
/**
* Combinatorial Group Testing to Find Frequent Items.
*
* Reference:
* G. Cormode, S. Muthukrishnan. What's Hot and What's Not: Tracking Most Frequent Items Dynamically, PODS 2003
*
* <a href="http://www.cs.rutgers.edu/~muthu/massdal-code-index.html">Original code</a>
*
* @author Maycon Viana Bordin <mayconbordin@gmail.com>
*/
public class CGT implements IBaseFrequency<Integer>, IFrequencyList<Integer> {
    /** Threshold used by the query methods that do not take one explicitly. */
    public static final int DEFAULT_THRESHOLD = 1;

    /** Number of independent tests (hash functions). */
    private final int tests;
    /** Number of bits examined per digit group. */
    private final int gran;
    /** Number of buckets per test. */
    private final int buckets;
    /** Number of digit groups per item: ceil(logn / gran). */
    private final int groups;
    /** Running total of every increment passed to {@link #add(Integer, long)}. */
    private long count;
    /**
     * One row per (test, bucket) pair, laid out test-major. Cell 0 of a row is
     * the bucket total; the remaining cells hold, per digit group, one counter
     * for each non-zero digit value. The most significant group comes first.
     */
    private final int[][] counts;
    /** Multiplicative coefficient of each test's hash function. */
    private final long[] testa;
    /** Additive coefficient of each test's hash function. */
    private final long[] testb;

    /**
     * Create the data structure for Combinatorial Group Testing.
     *
     * <p>Items are decomposed into {@code ceil(logn / gran)} digit groups of
     * {@code gran} bits each, so {@code logn} does not have to be a multiple of
     * {@code gran}.
     *
     * <p>Limitations of the recovery step: item {@code 0} and negative items
     * can be added but are never reported by the query methods, because
     * {@code 0} doubles as the "no item found" marker and only positive
     * candidates are accepted.
     *
     * @param buckets number of buckets per test, at least 1
     * @param tests   number of tests to keep, at least 1
     * @param logn    bit depth of the items which will arrive, at least 1;
     *                items are {@code int}s, so bits above 32 carry no information
     * @param gran    granularity at which to perform the testing, between 1 and 30:
     *                gran = 1 means one bit at a time,
     *                gran = 4 means one nibble at a time,
     *                gran = 8 means one octet at a time, etc.
     * @throws IllegalArgumentException if an argument is out of range or the
     *         resulting counter table would not fit in an array
     */
    public CGT(int buckets, int tests, int logn, int gran) {
        if (buckets < 1 || tests < 1) {
            throw new IllegalArgumentException(
                    "buckets and tests must be positive: buckets=" + buckets + ", tests=" + tests);
        }
        if (logn < 1) {
            throw new IllegalArgumentException("logn must be positive: " + logn);
        }
        if (gran < 1 || gran > 30) {
            throw new IllegalArgumentException("gran must be between 1 and 30: " + gran);
        }

        this.tests = tests;
        this.gran = gran;
        this.buckets = buckets;
        // Round up so that the table has room for every group the insert and
        // recovery loops visit, even when gran does not divide logn.
        this.groups = (logn - 1) / gran + 1;

        long rows = (long) buckets * tests;
        long cellsPerRow = 1L + (long) groups * ((1L << gran) - 1);
        if (rows > Integer.MAX_VALUE || cellsPerRow > Integer.MAX_VALUE) {
            throw new IllegalArgumentException(
                    "counter table too large: rows=" + rows + ", cellsPerRow=" + cellsPerRow);
        }

        count = 0;
        testa = new long[tests];
        testb = new long[tests];
        counts = new int[(int) rows][(int) cellsPerRow];

        // Initialise the hash functions. Masking the sign bit (rather than
        // negating) also yields a non-negative value for Long.MIN_VALUE.
        Random random = new Random();
        for (int i = 0; i < tests; i++) {
            testa[i] = random.nextLong() & Long.MAX_VALUE;
            testb[i] = random.nextLong() & Long.MAX_VALUE;
        }
    }

    /**
     * Adds a single occurrence of {@code item}.
     *
     * @param item the item to count
     * @return always {@code true}
     * @throws FrequencyException declared by the interface; not thrown here
     */
    public boolean add(Integer item) throws FrequencyException {
        return add(item, 1L);
    }

    /**
     * Adds {@code incrementCount} occurrences of {@code item} to its bucket in
     * every test.
     *
     * @param item           the item to count
     * @param incrementCount number of occurrences to add; must fit in an
     *                       {@code int} because the counters are {@code int}s
     * @return always {@code true}
     * @throws IllegalArgumentException if {@code incrementCount} does not fit in an {@code int}
     * @throws FrequencyException declared by the interface; not thrown here
     */
    public boolean add(Integer item, long incrementCount) throws FrequencyException {
        if (incrementCount < Integer.MIN_VALUE || incrementCount > Integer.MAX_VALUE) {
            throw new IllegalArgumentException(
                    "incrementCount does not fit in an int: " + incrementCount);
        }
        int value = item;
        int inc = (int) incrementCount;

        count += incrementCount;
        for (int t = 0; t < tests; t++) {
            logInsert(rowOf(t, value), value, inc);
        }
        return true;
    }

    /**
     * Returns the running total of all increments passed to
     * {@link #add(Integer, long)}.
     *
     * NOTE(review): assumes the interface defines size as the total count seen
     * rather than the number of distinct items; confirm against IBaseFrequency.
     *
     * @return the sum of all increments added so far
     */
    public long size() {
        return count;
    }

    /**
     * Recovers the items whose counters are at or above the threshold in every test.
     *
     * @param minSupport absolute count threshold; it is truncated to an
     *                   {@code int}, not interpreted as a fraction of the stream
     * @return a new list with one entry per recovered item; each entry carries
     *         the smallest bucket total seen for that item across all tests
     */
    public List<CountEntry<Integer>> getFrequentItems(double minSupport) {
        return new ArrayList<CountEntry<Integer>>(recover((int) minSupport).values());
    }

    /**
     * Scans every bucket of every test for a single dominant item and keeps the
     * candidates that pass the threshold in all tests.
     *
     * @param thresh absolute count threshold
     * @return recovered items mapped to their entries
     */
    private Map<Integer, CountEntry<Integer>> recover(int thresh) {
        Map<Integer, CountEntry<Integer>> results = new HashMap<Integer, CountEntry<Integer>>();
        for (int t = 0; t < tests; t++) {
            for (int b = 0; b < buckets; b++) {
                // Go into the group and see if there is a frequent item there...
                int guess = findOne(t * buckets + b, thresh);
                // ...then check that the item really hashes into that group.
                if (guess <= 0 || bucketOf(t, guess) != b) {
                    continue;
                }
                // Every bucket the item hashes to must be above threshold. The
                // smallest of those totals is what gets reported.
                int lowest = Integer.MAX_VALUE;
                for (int k = 0; k < tests; k++) {
                    lowest = Math.min(lowest, counts[rowOf(k, guess)][0]);
                }
                if (lowest >= thresh) {
                    results.put(guess, new CountEntry<Integer>(guess, lowest));
                }
            }
        }
        return results;
    }

    /**
     * Bucket that {@code item} hashes to within test {@code test}. The modulus
     * is taken on the {@code long} hash so that insertion and recovery always agree.
     */
    private int bucketOf(int test, int item) {
        return (int) (HashUtils.hash31(testa[test], testb[test], item) % buckets);
    }

    /** Row of {@link #counts} that holds {@code item} for test {@code test}. */
    private int rowOf(int test, int item) {
        return test * buckets + bucketOf(test, item);
    }

    /**
     * Tries to decode one dominant item from a row, one digit group at a time.
     * In each group exactly one digit value (the implicit zero digit included)
     * must reach the threshold; otherwise the row is rejected.
     *
     * @param pos    row index into {@link #counts}
     * @param thresh absolute count threshold
     * @return the identity of the frequent item if there was one, or zero if there was none
     */
    private int findOne(int pos, int thresh) {
        int[] row = counts[pos];
        // If the bucket total is not above threshold, then reject.
        if (row[0] < thresh) {
            return 0;
        }

        int digitValues = 1 << gran;
        int item = 0;
        int offset = 1;
        for (int g = 0; g < groups; g++) {
            int above = 0;
            int sum = 0;
            int last = 0;
            for (int v = 1; v < digitValues; v++) {
                int c = row[offset++];
                if (c >= thresh) {
                    above++;
                    last = v;
                }
                sum += c;
            }
            // Whatever is not accounted for by the non-zero digits had digit 0.
            if (row[0] - sum >= thresh) {
                above++;
            }
            // If several digit values (or none) are above threshold, then
            // reject the whole group.
            if (above != 1) {
                return 0;
            }
            // Update the record of the identity of the frequent item.
            item = (item << gran) + last;
        }
        return item;
    }

    /**
     * Adds {@code inc} to the row total and to the counter of every non-zero
     * digit of {@code val}. Digits are consumed from the least significant end,
     * so the offset walks backwards from the last group to the first.
     *
     * @param pos row index into {@link #counts}
     * @param val item being inserted
     * @param inc amount to add (may be negative)
     */
    private void logInsert(int pos, int val, int inc) {
        int bitmask = (1 << gran) - 1;
        int[] row = counts[pos];

        // Add the increment to the count of the group.
        row[0] += inc;

        int offset = (groups - 1) * bitmask;
        for (int g = 0; g < groups; g++) {
            int digit = val & bitmask;
            if (digit != 0) {
                row[offset + digit] += inc;
            }
            val >>= gran; // look at the next set of bits
            offset -= bitmask;
        }
    }

    /**
     * Returns the items that can currently be recovered at
     * {@link #DEFAULT_THRESHOLD}. The structure does not store the keys it has
     * seen, so this is recomputed on every call.
     *
     * NOTE(review): this replaces a null return; confirm it matches what
     * IFrequencyList expects from keySet.
     *
     * @return a new set of recovered items, never {@code null}
     */
    public Set<Integer> keySet() {
        return recover(DEFAULT_THRESHOLD).keySet();
    }

    /**
     * Returns at most {@code k} frequent items using {@link #DEFAULT_THRESHOLD}.
     *
     * @param k maximum number of entries to return
     * @return the first {@code k} entries in the entries' natural order
     */
    public List<CountEntry<Integer>> peek(int k) {
        return peek(k, DEFAULT_THRESHOLD);
    }

    /**
     * Returns at most {@code k} frequent items for the given threshold.
     *
     * @param k          maximum number of entries to return
     * @param minSupport absolute count threshold, see {@link #getFrequentItems(double)}
     * @return the first {@code k} entries after sorting by the entries' natural order
     */
    public List<CountEntry<Integer>> peek(int k, double minSupport) {
        List<CountEntry<Integer>> items = getFrequentItems(minSupport);
        Collections.sort(items);
        if (items.size() > k) {
            return items.subList(0, k);
        }
        return items;
    }

    /**
     * Recovers the frequent items using {@link #DEFAULT_THRESHOLD}.
     *
     * @return a new list with one entry per recovered item
     */
    public List<CountEntry<Integer>> getFrequentItems() {
        return getFrequentItems(DEFAULT_THRESHOLD);
    }
}