/* XXL: The eXtensible and fleXible Library for data processing Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger Head of the Database Research Group Department of Mathematics and Computer Science University of Marburg Germany This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; If not, see <http://www.gnu.org/licenses/>. http://code.google.com/p/xxl/ */ package xxl.core.math; import java.util.ArrayList; import java.util.Comparator; import java.util.Iterator; import java.util.List; import xxl.core.collections.queues.Heap; import xxl.core.cursors.Cursors; import xxl.core.cursors.mappers.Aggregator; import xxl.core.functions.AbstractFunction; import xxl.core.functions.Function; import xxl.core.math.functions.AggregationFunction; import xxl.core.math.functions.RealFunction; import xxl.core.math.functions.RealFunctionFunction; import xxl.core.math.numerics.integration.SimpsonsRuleRealFunctionArea; import xxl.core.math.statistics.nonparametric.kernels.GaussianKernel; import xxl.core.math.statistics.parametric.aggregates.StatefulAverage; import xxl.core.math.statistics.parametric.aggregates.Count; import xxl.core.math.statistics.parametric.aggregates.Maximum; import xxl.core.math.statistics.parametric.aggregates.Minimum; import xxl.core.math.statistics.parametric.aggregates.StatefulStandardDeviation; import xxl.core.util.Distance; import xxl.core.util.DoubleArrays; /** This class contains some useful static methods for dealing with statistical functions. */ public class Statistics { /** Precomputed constant value used for computing p-quantils of the * standard normal distribution. */ protected static final double A0 = 2.50662823884; /** Precomputed constant value used for computing p-quantils of the * standard normal distribution. */ protected static final double A1 = -18.61500062529; /** Precomputed constant value used for computing p-quantils of the * standard normal distribution. */ protected static final double A2 = 41.39119773534; /** Precomputed constant value used for computing p-quantils of the * standard normal distribution. */ protected static final double A3 = -25.44106049637; /** Precomputed constant value used for computing p-quantils of the * standard normal distribution. */ protected static final double B1 = -8.47351093090; /** Precomputed constant value used for computing p-quantils of the * standard normal distribution. */ protected static final double B2 = 23.08336743743; /** Precomputed constant value used for computing p-quantils of the * standard normal distribution. */ protected static final double B3 = -21.06224101826; /** Precomputed constant value used for computing p-quantils of the * standard normal distribution. */ protected static final double B4 = 3.13082909833; /** Precomputed constant value used for computing p-quantils of the * standard normal distribution. */ protected static final double C0 = -2.78718931138; /** Precomputed constant value used for computing p-quantils of the * standard normal distribution. */ protected static final double C1 = -2.29796479134; /** Precomputed constant value used for computing p-quantils of the * standard normal distribution. */ protected static final double C2 = 4.85014127135; /** Precomputed constant value used for computing p-quantils of the * standard normal distribution. */ protected static final double C3 = 2.32121276858; /** Precomputed constant value used for computing p-quantils of the * standard normal distribution. */ protected static final double D1 = 3.54388924762; /** Precomputed constant value used for computing p-quantils of the * standard normal distribution. */ protected static final double D2 = 1.63706781897; /** * The default constructor has private access in order to ensure * non-instantiability. */ private Statistics() { } /** Computes the p-quantil (lower tail area) of the standard normal distribution. * Based on:<br>P.Griffiths, I.D.Hill: Applied Statistics Algorithms: * Ellis Horwood Limited: 1985 :p.188 ff. (AS 111). * * @param p quantil to compute * @return p-quantil of the N(0,1)-Distribution. */ public static double normalQuantil(double p) { if (p <= 0 || p >= 1) { throw new IllegalArgumentException("quantile to compute must be in (0,1)!"); } double ppnd = 0.0; double r = 0.0; if (p <= 0.0) return 0.0 / 0.0; if (p >= 1.0) return 0.0; double q = p - 0.5; if (!(q >= 0.42)) { r = q * q; ppnd = q * (((A3 * r + A2) * r + A1) * r + A0) / ((((B4 * r + B3) * r + B2) * r + B1) * r + 1.0); } else { if (q >= 0.0) r = 1.0 - p; if (r <= 0.0) return 0.0 / 0.0; r = Math.sqrt(-1.0 * Math.log(r)); ppnd = (((C3 * r + C2) * r + C1) * r + C0) / ((D2 * r + D1) * r + 1.0); if (q <= 0.0) ppnd = -1.0 * ppnd; } return ppnd; } /** Converts a cumulative frequency distribution of numerical data into two * arrays of types <tt>double []</tt> for the data and <tt>int []</tt> for * the frequencies. * * @param cfd cumulative frequency distribution of numerical data * @param mapping function that maps the data values to an alternative representation * @return an <tt>Object []</tt> filled with a <tt>double []</tt> containing the data * and an <tt>int []</tt> containing the corresponding frequencies */ public static Object[] doubleArrayCFD(Iterator cfd, Function mapping) { List a = new ArrayList(); Cursors.toList(cfd, a); double[] data = new double[a.size()]; int[] freqencies = new int[a.size()]; Iterator it = a.iterator(); int pos = -1; Object[] temp = null; while (it.hasNext()) { pos++; temp = (Object[]) it.next(); data[pos] = ((Double) mapping.invoke(temp[0])).doubleValue(); freqencies[pos] = ((Long) temp[1]).intValue(); } return new Object[] { data, freqencies }; } /** Converts a cumulative frequency distribution of numerical data into two * arrays of types <tt>double []</tt> for the data and <tt>int []</tt> for * the frequencies. * * @param cfd cumulative frequency distribution of numerical data * @return an <tt>Object []</tt> filled with a <tt>double []</tt> containing the data * and a <tt>int []</tt> containing the corresponding frequencies */ public static Object[] doubleArrayCFD(Iterator cfd) { return doubleArrayCFD(cfd, new AbstractFunction() { public Object invoke(Object o) { return new Double(((Number) o).doubleValue()); } }); } /** Evaluates a real-valued function on a given grid of double values. * Real-valued function means that the given function consumes objects of type <tt>Double</tt> * and returns Objects of type {@link java.lang.Number Number}. * * @param grid given data points to evaluate * @param realFunction real-valued function (one dimensional) to evaluate * @return a <tt>double []</tt> filled with the function values on the grid */ public static double[] evalReal1DFunction( double[] grid, Function realFunction) { double[] r = new double[grid.length]; for (int i = 0; i < r.length; i++) { r[i] = ((Number) realFunction.invoke(new Double(grid[i]))).doubleValue(); } return r; } /** Evaluates a {@link xxl.core.math.functions.RealFunction real-valued function} * on a given grid of double values. * * @param grid given data points to evaluate * @param f function to evaluate * @return a <tt>double []</tt> filled with the function values on the grid */ public static double[] evalRealFunction(double[] grid, RealFunction f) { double[] r = new double[grid.length]; for (int i = 0; i < r.length; i++) { r[i] = f.eval(grid[i]); } return r; } /** Evaluates a given real-valued function on an interval [left,right] with a * predefined number of points. * Real-valued function means that the given function consumes objects of type <tt>Double</tt> * and returns Objects of type {@link java.lang.Number Number}. * * @param a left border of the interval to evaluate * @param b right border of the interval to evaluate * @param n number of points to evaluate * @param realFunction real-valued function (one dimensional) to evaluate * @return a <tt>double []</tt> filled with the function values on the grid */ public static double[] evalReal1DFunction( double a, double b, int n, Function realFunction) { return evalReal1DFunction(DoubleArrays.equiGrid(a, b, n), realFunction); } /** Evaluates a {@link xxl.core.math.functions.RealFunction real-valued function} * on equally spaced, real-valued data points. * * @param a left border of the interval to evaluate * @param b right border of the interval to evaluate * @param n number of points to evaluate * @param f function to evaluate * @return <tt>double []</tt> filled with the function values on the grid */ public static double[] evalRealFunction( double a, double b, int n, RealFunction f) { return evalRealFunction(DoubleArrays.equiGrid(a, b, n), f); } /** Evaluates a given real-valued function on a given grid of double values. * Real-valued function means that the given function consumes objects of type <tt>Double</tt> * and returns Objects of type {@link java.lang.Number Number}. * * @param grid given data points to evaluate * @param realFunction real-valued function (one dimensional) to evaluate * @return a <tt>double []</tt> filled with the evaluated real-valued function points alternately with the * data points, meaning the returned array consists of {x0, y0, x1, y1, x2, y2, ..., xn-1, yn-1} * with f(xi) = yi for all i = 0, ... , n-1 */ public static double[] evalReal1DFunctionX( double[] grid, Function realFunction) { double[] r = new double[2 * grid.length]; for (int i = 0; i < r.length; i = i + 2) { int k = (i / 2); r[i] = grid[k]; r[i + 1] = ((Number) realFunction.invoke(new Double(grid[k]))) .doubleValue(); } return r; } /** Evaluates a given real-valued function on equally spaced real-valued data points. * Real-valued function means that the given function consumes objects of type <tt>Double</tt> * and returns Objects of type {@link java.lang.Number Number}. * * @param a left border of the interval to evaluate * @param b right border of the interval to evaluate * @param n number of points to evaluate * @param realFunction real-valued function (one dimensional) to evaluate * @return a <tt>double []</tt> filled with the evaluated real-valued function points alternately with the * data points, meaning the returned array consists of {x0, y0, x1, y1, x2, y2, ..., xn-1, yn-1} * with f(xi) = yi for all i = 0, ... , n-1 */ public static double[] evalReal1DFunctionX( double a, double b, int n, Function realFunction) { return evalReal1DFunctionX( DoubleArrays.equiGrid(a, b, n), realFunction); } /** * Computes the value of the Gaussian probability density function (pdf) at a given point x. * * @param x argument where to evaluate the pdf * @param mean mean of the pdf * @param variance variance of the pdf * @throws IllegalArgumentException if variance < 0 * @return value of the pdf at position x */ public static double gaussian(double x, double mean, double variance) throws IllegalArgumentException { if (variance < 0.0) throw new IllegalArgumentException("variance has to be >= 0!"); return Math.exp(Math.pow(x-mean,2)/(-2.0*Math.pow(variance,2)))/(variance*Math.sqrt(2.0*Math.PI)); } /** * Computes the probability density function (pdf) of the standard Gaussian distribution (normal pdf). * An expectation value of 0.0 and a variance of 1.0 will be assumed. * * @param x argument where to evaluate the pdf * @return value of the pdf at position x * @throws IllegalArgumentException invalid argument */ public static double gaussian(double x) throws IllegalArgumentException { return gaussian(x, 0.0, 1.0); } /** * Returns a numerical approximation of the primitive of the standard Gaussian distribution (normal pdf). * * @param argument where to evaluate the pdf * @param n iteration number for the numerical approximation * @return value of the pdf at position x */ public static double gaussianPrimitive(double x, int n) { if(x < -3) return 0; if(x > 3) return 1; return SimpsonsRuleRealFunctionArea.simpson(-3, x, new RealFunctionFunction(new GaussianKernel()), n); } /** * Returns a numerical approximation of the primitive of the standard Gaussian distribution (normal pdf). * * @param argument where to evaluate the pdf * @return value of the pdf at position x */ public static double gaussianPrimitive(double x) { return gaussianPrimitive(x, 100); } /* --- kernel functions ---*/ /** Evaluates the Epanechnikow kernel. * * @param x function argument * @return f(x) = 0.75 ( 1 - x^2) * I(x)_[-1,1] * @see xxl.core.math.statistics.nonparametric.kernels.EpanechnikowKernel */ public static double epanechnikow(double x) { if ((x < -1) || (x > 1)) return 0; return 0.75 * (1 - x * x); } /** Evaluates the primitive of the Epanechnikow kernel. * * @param x function argument * @return value of the primitive of the Epanechnikow kernel * @see xxl.core.math.statistics.nonparametric.kernels.EpanechnikowKernel */ public static double epanechnikowPrimitive(double x) { return epanechnikowPrimitive( x, (-1.0) * epanechnikowPrimitive(-1.0, 0.0)); } /** Evaluates the primitive of the Epanechnikow kernel and adds a constant c. * * @param x function argument * @param c constant added to the primitive * @return value of the primitive of the Epanechnikow kernel plus a constant c * @see xxl.core.math.statistics.nonparametric.kernels.EpanechnikowKernel */ public static double epanechnikowPrimitive(double x, double c) { double r = 0.25 * (3 * x - x * x * x) + c; if(x < -1) return 0; if(x > 1) return 1; return r; } /** Evaluates the Biweight kernel. * * @param x function argument * @return value of the Biweight kernel * @see xxl.core.math.statistics.nonparametric.kernels.BiweightKernel */ public static double biweight(double x) { double r = 0.9375 * (1 - x * x) * (1 - x * x); if ((x < -1) || (x > 1)) r = 0.0; return r; } /** Evaluates the primitive of the Biweight kernel and adds a constant c. * * @param x function argument * @param c constant added to the primitive * @return value of the primitive of the Biweight kernel plus a constant c * @see xxl.core.math.statistics.nonparametric.kernels.BiweightKernel */ public static double biweightPrimitive(double x, double c) { if(x < -1) return 0; if(x > 1) return 1; return 0.1875 * x * x * x * x * x - 0.625 * x * x * x + 0.9375 * x + c; } /** Evaluates the primitive of the Biweight kernel. * * @param x function argument * @return value of the primitive of the Biweight kernel * @see xxl.core.math.statistics.nonparametric.kernels.BiweightKernel */ public static double biweightPrimitive(double x) { return biweightPrimitive(x, (-1.0) * biweightPrimitive(-1.0, 0.0)); } /** Evaluates the derivative of the Biweight kernel. * * @param x function argument * @return value of the derivative of the Biweight kernel * @see xxl.core.math.statistics.nonparametric.kernels.BiweightKernel */ public static double biweightDerivative(double x) { double r = 0.0; if ((x >= -1) && (x <= 1)) { r += 3.75 * x * (x * x - 1); } return r; } /** Evaluates the Triweight kernel. * * @param x function argument * @return value of the Triweight kernel * @see xxl.core.math.statistics.nonparametric.kernels.TriweightKernel */ public static double triweight(double x) { double r = 1.09375 * (1 - x * x) * (1 - x * x) * (1 - x * x); if ((x < -1) || (x > 1)) r = 0.0; return r; } /** Evaluates the primitive of the Triweight kernel and adds a constant c. * * @param x function argument * @param c constant added to the primitive * @return the primitive of the Triweight function evaluated at the given argument * @see xxl.core.math.statistics.nonparametric.kernels.TriweightKernel */ public static double triweightPrimitive(double x, double c) { if(x < -1) return 0; if(x > 1) return 1; double r = -Math.pow(x,7)/7 + Math.pow(x,5)*0.6 - Math.pow(x,3) + x + 16/35.0; r *= 35/32.0; return r; } /** Evaluates the primitive of the Triweight kernel. * * @param x function argument * @return value of the primitive of the Triweight kernel * @see xxl.core.math.statistics.nonparametric.kernels.TriweightKernel */ public static double triweightPrimitive(double x) { return triweightPrimitive(x, (-1.0) * triweightPrimitive(-1.0, 0.0)); } /** Evaluates the derivative of the Triweight kernel. * * @param x function argument * @return value of the derivative of the Triweight kernel * @see xxl.core.math.statistics.nonparametric.kernels.TriweightKernel */ public static double triweightDerivative(double x) { double r = 0.0; if ((x >= -1) && (x <= 1)) { r += 2.0 * (x * x * x) - (x * x * x * x * x) - x; r *= 6.5625; } return r; } /** Evaluates the CosineArch kernel. * * @param x function argument * @return value of the CosineArch kernel * @see xxl.core.math.statistics.nonparametric.kernels.CosineArchKernel */ public static double cosineArch(double x) { double r = 0.25 * Math.PI * Math.cos(0.5 * Math.PI * x); if ((x < -1) || (x > 1)) r = 0.0; return r; } /** Evaluates the primitive of the CosineArch kernel and adds a constant c. * * @param x function argument * @param c constant added to the primitive * @return value of the primitive of the CosineArch kernel plus a constant c * @see xxl.core.math.statistics.nonparametric.kernels.CosineArchKernel */ public static double cosineArchPrimitive(double x, double c) { if(x < -1) return 0; if(x > 1) return 1; return 0.5 * Math.sin(0.5 * Math.PI * x) + c; } /** Evaluates the primitive of the CosineArch kernel. * * @param x function argument * @return value of the primitive of the CosineArch kernel * @see xxl.core.math.statistics.nonparametric.kernels.CosineArchKernel */ public static double cosineArchPrimitive(double x) { return cosineArchPrimitive(x, (-1.0) * cosineArchPrimitive(-1.0, 0.0)); } /** Evaluates the derivative of the CosineArch kernel. * * @param x function argument * @return value of the derivative of the CosineArch kernel * @see xxl.core.math.statistics.nonparametric.kernels.CosineArchKernel */ public static double cosineArchDerivative(double x) { double r = 0.0; if ((x >= -1) && (x <= 1)) { r += -0.125 * Math.PI * Math.PI * Math.sin(0.5 * Math.PI * x); } return r; } /* common statistics */ /** Returns the variance of a given dataset. The data is assumed to be * numerical, i.e. of type {@link java.lang.Number Number}. * * @param data numerical data * @return variance of the data */ public static double variance(Object[] data) { double r = 0.0; double avg = average(data); double x = 0.0; for (int i = 0; i < data.length; i++) { x = ((Number) data[i]).doubleValue(); r += (x - avg) * (x - avg); } return r / data.length; } /** Returns the unbiased variance of a given sample. The data is assumed to be * numerical ,i.e., of type {@link java.lang.Number Number}. * * @param data numerical data * @return variance of the data */ public static double sampleVariance(Object[] data) { double r = 0.0; double avg = average(data); double x = 0.0; for (int i = 0; i < data.length; i++) { x = ((Number) data[i]).doubleValue(); r += (x - avg) * (x - avg); } return r / (data.length - 1); } /** Returns the average of a given dataset. The data is assumed to be * numerical i.e. of type {@link java.lang.Number Number}. * * @param data numerical data * @return average of the data */ public static double average(Object[] data) { double r = 0.0; for (int i = 0; i < data.length; i++) { r += ((Number) data[i]).doubleValue(); } return r / data.length; } /** Computes different aggregates (statistics) of a given set of data. * * @param data data given as stream containing Objects of type <TT>Number</TT> * @return An <TT>Object[] o</TT> containing Objects representing the computed statistics as:<BR> * <TT>o[0]</TT>: number of processed data <BR> * <TT>o[1]</TT>: minimum of processed data <BR> * <TT>o[2]</TT>: maximum of processed data <BR> * <TT>o[3]</TT>: average of processed data <BR> * <TT>o[4]</TT>: standard deviation of processed data <BR> */ public static Object[] getStatistics(Iterator data) { return ((List)(new Aggregator(data, Maths.multiDimAggregateFunction(new AggregationFunction[] { new Count(), new Minimum(), new Maximum(), new StatefulAverage(), new StatefulStandardDeviation()}))) .last()).toArray(); } /** This method computes the n maximum differences in the given data. To do so, a * {@link xxl.core.util.Distance distance} must be defined for the data. * For example, the cumulative * frequency distribution could be used with a difference function defined as * the differences of the frequencies of the data points. * * @param data data used to compute the max-diff-positions * @param distance distance function used for determining the n biggest differences * @param n number of differences to find * @param noZero indicates whether the distance of two objects which is equal to zero is allowed * @throws IllegalArgumentException if the number of differences doesn't match the number * of given data ( n too big or too less data) * @return the indices (positions where the data occurred in the iterator) of the n biggest * differences according to the given distance function, starting with the (left) index of the n-th * biggest difference and ending with the biggest one */ public static int [] maxDiff ( Iterator data, Distance distance, int n, boolean noZero) { // init the used min-heap, ordering according to the distances // the objects inserted into the heap will be Object[] Heap heap = new Heap ( n, new Comparator (){ public int compare ( Object o1, Object o2){ double d1 = ((Number) ((Object []) o1)[0]).doubleValue(); double d2 = ((Number) ((Object []) o2)[0]).doubleValue(); return d1 < d2 ? -1 : d1 == d2 ? 0 : 1; } }); heap.open(); // get the first element Object previous = null; if ( data.hasNext () ) { previous = data.next(); } else throw new IllegalArgumentException("input data is empty, but need at least " + (n+1) + " objects!"); // init helping vars int inserted = 0; // number of inserted distances or number of Objects in the heap int position = 0; // numbering the processed data (index) Object next = null; // processing the data while ( data.hasNext() ){ next = data.next(); position++; // increase the index // computing difference double dist = distance.distance ( previous, next); // should trivial distances be allowed if (!( noZero && (dist == 0.0))) { // enough data inserted into the heap if ( inserted < n ){ // no heap.enqueue( new Object[]{ new Double (dist ), new Integer (position-1) }); inserted++; } else { // yes // smallest value in the heap < current dist? if ( ((Double) ((Object []) heap.peek () )[0]).doubleValue() < dist ) { // yes // insert current distance heap.replace ( new Object[]{ new Double (dist ), new Integer (position-1) }); } } } previous = next; } // if ( heap.size() != n) throw new IllegalArgumentException ("Not enough data given. Check parameters!"); // int [] indices = new int [n]; int pos = 0; while (!heap.isEmpty()) { indices [pos++] = ((Integer) ((Object[]) heap.dequeue())[1]).intValue(); } heap.close(); return indices; } }