/** * */ package org.streaminer.stream.model; import java.io.Serializable; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Set; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * <p> * This class implements a histogram-model observed from a data stream. The input data * elements are regarded to be nominal elements, i.e. Java Strings. * </p> * * @author Christian Bockermann <chris@jwall.org> * */ public class NominalDistributionModel<T extends Serializable> implements SelectiveDescriptionModel<T, Double>, Distribution<T> { /** The unique class ID */ private static final long serialVersionUID = -4642672370564928117L; /* A global logger for this class */ static Logger log = LoggerFactory.getLogger( NominalDistributionModel.class ); /* The total number of elements observed by this model */ Integer count = 0; /* The maximum number of elements kept in the count-map */ Integer max = Integer.MAX_VALUE; /* The map of counts, i.e. the frequencies for the observed elements */ Map<T,Integer> counts = new LinkedHashMap<T,Integer>(); /** * Creates a new nominal distribution model with an infinite number * of objects being tracked. Infinite here depends on available memory * and is initially assumed as <code>Integer.MAX_VALUE</code> */ public NominalDistributionModel(){ this( Integer.MAX_VALUE ); } /** * Creates a new nominal distribution model. The parameter specifies * the maximum number of distinct elements that will be counted in this * model. * * @param maxElements */ public NominalDistributionModel( int maxElements ){ this.max = maxElements; this.counts = new LinkedHashMap<T,Integer>(); this.count = 0; } /** * @see stream.model.SelectiveDescriptionModel#describe(java.lang.Object) */ @Override public Double describe(Serializable parameter) { Integer cnt = counts.get( parameter ); if( cnt == null ) return 0.0d; return cnt.doubleValue() / count; } /** * Add a new value to the model. * @param newVal */ public void update( T newVal ){ if( newVal == null ){ log.warn( "Skipping 'null' value!" ); return; } synchronized( counts ){ Integer cnt = counts.get( newVal ); if( cnt == null ){ cnt = new Integer(1); } else cnt = new Integer( cnt.intValue() + 1 ); counts.put( newVal, cnt ); count++; } } /* (non-Javadoc) * @see stream.model.DistributionModel#getHistogram() */ public Map<T,Double> getHistogram(){ Map<T,Double> map = new LinkedHashMap<T,Double>(); for( T key : counts.keySet() ) map.put( key, counts.get(key).doubleValue() ); return map; } /* (non-Javadoc) * @see stream.model.DistributionModel#getCount() */ public Integer getCount(){ return count; } public Set<T> getElements(){ return counts.keySet(); } public Integer getCount( T value ){ Integer cnt = counts.get( value ); if( cnt == null ) return 0; return cnt; } /** * <p> * This method trancates the size of this distribution-model by removing * elements until only the given maximum number of elements resides in the * count-map. * </p> * <p> * Specifying any value < 1 for <code>maxElements</code> will completely * prune all elements. * </p> * * @param maxElements */ public void truncate( int maxElements ){ log.trace( "Truncating distribution to {} elements", maxElements ); if( maxElements < 1 ){ count = 0; counts.clear(); return; } synchronized( counts ){ List<T> elements = new ArrayList<T>( counts.keySet() ); Collections.sort( elements, new DistributionComparator( counts ) ); log.trace( "Sorted elements: {}", elements ); int removed = 0; for( int i = 0; counts.size() > maxElements; i++ ){ Integer cnt = counts.remove( elements.get(i) ); count = count - cnt; removed++; } log.debug( "removed {} elements", removed ); } } /** * This comparator can be used for sorting elements in ascending order, based * on their frequencies. */ class DistributionComparator implements Comparator<T> { Map<T,Integer> counts; public DistributionComparator( Map<T,Integer> counts ){ this.counts = counts; } /** * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) */ @Override public int compare(T arg0, T arg1) { Integer c0 = counts.get( arg0 ); if( c0 == null ) c0 = 0; Integer c1 = counts.get( arg1 ); if( c1 == null ) c1 = 0; int rc = c0.compareTo( c1 ); if( rc == 0 ){ return arg0.toString().compareTo( arg1.toString() ); } return rc; } } /** * @see stream.model.Distribution#prob(java.io.Serializable) */ @Override public Double prob(T value) { Integer cnt = counts.get( value ); if( cnt == null ) return 0.0d; return cnt.doubleValue() / count.doubleValue(); } }