/* * Licensed to the Ted Dunning under one or more contributor license * agreements. See the NOTICE file that may be * distributed with this work for additional information * regarding copyright ownership. Ted Dunning licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package com.mapr.stats; import com.google.common.base.Preconditions; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Arrays; /** * Retains top samples from a stream by using a heap data structure. */ public class UpperQuantile { private static Logger log = LoggerFactory.getLogger(UpperQuantile.class); private int n = 0; private Heap biggest; private double[] sorted; public UpperQuantile(int maxRetained) { biggest = new Heap(maxRetained); } void add(double x) { sorted = null; biggest.add(x); n++; } /** * To find a particular quantile, we have to look at the top values in sorted order. * * @param q The quantile to estimate. * @return The value of the quantile. */ double quantile(double q) { Preconditions.checkState(biggest.size() > 0, "Can't get quantile with no data"); Preconditions.checkArgument(q >= 0, "Q must be >= 0"); Preconditions.checkArgument(q <= 1, "Q must be <= 1"); // how far from the max value? double item = (n - 1) * (1 - q); Preconditions.checkArgument(item <= biggest.size() - 1, "Can't get %.2 %-ile, only retained %d / %d items", 100 * q, biggest.size(), n); // and how far is that from the beginning of our retained samples? item = biggest.size() - item; // only sort the data once if (sorted == null) { sorted = biggest.data.clone(); Arrays.sort(sorted, 1, sorted.length); } // may want to interpolate values int i = (int) Math.floor(item); double fraction = item - i; if (fraction > 0) { return sorted[i] * (1 - fraction) + sorted[i + 1] * fraction; } else { return sorted[i]; } } public void clear() { biggest.clear(); } /** * Verify that the heap is well formed. */ public void validate() { biggest.validate(1); } /** * Print the heap to standard out. */ public void print() { biggest.print(1); } /** * See http://en.wikipedia.org/wiki/Heap_(data_structure) */ private static final class Heap { private int count = 0; private double[] data; private Heap(int size) { Preconditions.checkArgument(size > 0); data = new double[size + 1]; } private void add(double x) { if (count < data.length - 1) { // if we delayed initialization until the heap fills, we could go a bit faster (Floyd's algo) // if we often only add as many samples as are allowed in the heap, then that could make a significant // difference count++; data[count] = x; bubble(count); } else if (x > data[1]) { // x is bigger than the root, so x deserves to be in the heap and the current root does not pop(); add(x); } } /** * After a leaf node is placed at location i, the invariants may be violated. * This method restores the invariant. * * @param i The index of the leaf node which has been changed. */ private void bubble(int i) { if (i > 1) { int parent = i / 2; if (data[i] < data[parent]) { double tmp = data[i]; data[i] = data[parent]; data[parent] = tmp; bubble(parent); } } } /** * Remove the root of the heap and restore the invariant. * * @return The smallest value in the heap. */ private double pop() { Preconditions.checkState(count > 0, "Can't pop from empty heap"); double r = data[1]; pop(1); return r; } private double peek() { return data[1]; } private void pop(int i) { int left = 2 * i; if (left <= count) { int right = left + 1; if (right <= count) { if (data[left] < data[right]) { data[i] = data[left]; pop(left); } else { data[i] = data[right]; pop(right); } } else { data[i] = data[left]; pop(left); } } else if (i < count) { // leaf ... move the last element to here and rebalance upward data[i] = data[count]; count--; bubble(i); } else { count--; } } public double size() { return count; } public void clear() { count = 0; } /** * Rebalances the entire tree from the top-down. Floyd's algorithm would do it from * the bottom up and be faster, but this isn't used so it doesn't matter. * * @param base The base of the sub-heap to be balanced. */ private void rebalance(int base) { int left = 2 * base; if (left <= count && data[base] > data[left]) { double tmp = data[left]; data[left] = data[base]; data[base] = tmp; rebalance(left); } int right = left + 1; if (right <= count && data[base] > data[right]) { double tmp = data[right]; data[right] = data[base]; data[base] = tmp; rebalance(right); } } /** * Verify the invariant. Bitch if an error are found. * * @param i The index of the root of the tree to check. */ private boolean validate(int i) { if (i >= count) { return true; } int left = 2 * i; if (left <= count) { if (data[i] > data[left]) { log.warn("Data at {} > {}", i, left); log.warn("Data at {} > {}", data[i], data[left]); return false; } if (!validate(left)) { return false; } int right = left + 1; if (right <= count) { if (data[i] > data[right]) { log.warn("Data at {} > {}", i, right); log.warn("Data at {} > {}", data[i], data[right]); return false; } if (!validate(right)) { return false; } } } return true; } private void print(int i) { if (i <= count) { System.out.printf("%4d, %4d ", i, 31 - Integer.numberOfLeadingZeros(i)); indent(31 - Integer.numberOfLeadingZeros(i)); System.out.printf("%.3f\n", data[i]); print(2 * i); print(2 * i + 1); } } private void indent(int indent) { for (int j = 0; j < indent; j++) { System.out.printf("| "); } } } }