/* * DiscreteStatistics.java * * Copyright (c) 2002-2015 Alexei Drummond, Andrew Rambaut and Marc Suchard * * This file is part of BEAST. * See the NOTICE file distributed with this work for additional * information regarding copyright ownership and licensing. * * BEAST is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * BEAST is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with BEAST; if not, write to the * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, * Boston, MA 02110-1301 USA */ package dr.stats; import dr.util.HeapSort; /** * simple discrete statistics (mean, variance, cumulative probability, quantiles etc.) * * @author Korbinian Strimmer * @author Alexei Drummond * @version $Id: DiscreteStatistics.java,v 1.11 2006/07/02 21:14:53 rambaut Exp $ */ public class DiscreteStatistics { /** * compute mean * * @param x list of numbers * @return mean */ public static double mean(double[] x) { double m = 0; int count = x.length; for(double aX : x) { if( Double.isNaN(aX) ) { count--; } else { m += aX; } } return m / (double) count; } /** * compute median * * @param x list of numbers * @param indices index sorting x * @return median */ public static double median(double[] x, int[] indices) { int pos = x.length / 2; if (x.length % 2 == 1) { return x[indices[pos]]; } else { return (x[indices[pos - 1]] + x[indices[pos]]) / 2.0; } } /** * compute the mean squared error * * @param x list of numbers * @param trueValue truth * @return MSE */ public static double meanSquaredError(double[] x, double trueValue) { if (x == null || x.length == 0) { throw new IllegalArgumentException(); } double total = 0; for(double sample : x) { total += (sample - trueValue) * (sample - trueValue); } total /= x.length; return total; } /** * compute median * * @param x list of numbers * @return median */ public static double median(double[] x) { if (x == null || x.length == 0) { throw new IllegalArgumentException(); } int[] indices = new int[x.length]; HeapSort.sort(x, indices); return median(x, indices); } /** * compute variance (ML estimator) * * @param x list of numbers * @param mean assumed mean of x * @return variance of x (ML estimator) */ public static double variance(double[] x, double mean) { double var = 0; int count = x.length; for(double aX : x) { if( Double.isNaN(aX) ) { count--; } else { double diff = aX - mean; var += diff * diff; } } if (count < 2) { count = 1; // to avoid division by zero } else { count = count - 1; // for ML estimate } return var / (double) count; } /** * compute covariance * * @param x list of numbers * @param y list of numbers * @return covariance of x and y */ @SuppressWarnings({"SuspiciousNameCombination"}) public static double covariance(double[] x, double[] y) { return covariance(x, y, mean(x), mean(y), stdev(x), stdev(y)); } /** * compute covariance * * @param x list of numbers * @param y list of numbers * @param xmean assumed mean of x * @param ymean assumed mean of y * @param xstdev assumed stdev of x * @param ystdev assumed stdev of y * @return covariance of x and y */ public static double covariance(double[] x, double[] y, double xmean, double ymean, double xstdev, double ystdev) { if (x.length != y.length) throw new IllegalArgumentException("x and y arrays must be same length!"); int count = x.length; double covar = 0.0; for (int i = 0; i < x.length; i++) { if (Double.isNaN(x[i]) || Double.isNaN(y[i])) { count --; } else { covar += (x[i] - xmean) * (y[i] - ymean); } } covar /= count; covar /= (xstdev * ystdev); return covar; } /** * compute fisher skewness * * @param x list of numbers * @return skewness of x */ public static double skewness(double[] x) { double mean = mean(x); double stdev = stdev(x); double skew = 0.0; double len = x.length; for (double xv : x) { double diff = xv - mean; diff /= stdev; skew += (diff * diff * diff); } skew *= (len / ((len - 1) * (len - 2))); return skew; } /** * compute standard deviation * * @param x list of numbers * @return standard deviation of x */ public static double stdev(double[] x) { return Math.sqrt(variance(x)); } /** * compute variance (ML estimator) * * @param x list of numbers * @return variance of x (ML estimator) */ public static double variance(double[] x) { final double m = mean(x); return variance(x, m); } /** * compute variance of sample mean (ML estimator) * * @param x list of numbers * @param mean assumed mean of x * @return variance of x (ML estimator) */ public static double varianceSampleMean(double[] x, double mean) { return variance(x, mean) / (double) x.length; } /** * compute variance of sample mean (ML estimator) * * @param x list of numbers * @return variance of x (ML estimator) */ public static double varianceSampleMean(double[] x) { return variance(x) / (double) x.length; } /** * compute the q-th quantile for a distribution of x * (= inverse cdf) * * @param q quantile (0 < q <= 1) * @param x discrete distribution (an unordered list of numbers) * @param indices index sorting x * @return q-th quantile */ public static double quantile(double q, double[] x, int[] indices) { if (q < 0.0 || q > 1.0) throw new IllegalArgumentException("Quantile out of range"); if (q == 0.0) { // for q==0 we have to "invent" an entry smaller than the smallest x return x[indices[0]] - 1.0; } return x[indices[(int) Math.ceil(q * indices.length) - 1]]; } /** * compute the q-th quantile for a distribution of x * (= inverse cdf) * * @param q quantile (0 <= q <= 1) * @param x discrete distribution (an unordered list of numbers) * @return q-th quantile */ public static double quantile(double q, double[] x) { int[] indices = new int[x.length]; HeapSort.sort(x, indices); return quantile(q, x, indices); } /** * compute the q-th quantile for a distribution of x * (= inverse cdf) * * @param q quantile (0 <= q <= 1) * @param x discrete distribution (an unordered list of numbers) * @param count use only first count entries in x * @return q-th quantile */ public static double quantile(double q, double[] x, int count) { int[] indices = new int[count]; HeapSort.sort(x, indices); return quantile(q, x, indices); } /** * Determine the highest posterior density for a list of values. * The HPD is the smallest interval containing the required amount of elements. * * @param proportion of elements inside the interval * @param x values * @param indices index sorting x * @return the interval, an array of {low, high} values. */ public static double[] HPDInterval(double proportion, double[] x, int[] indices) { double minRange = Double.MAX_VALUE; int hpdIndex = 0; final int diff = (int) Math.round(proportion * (double) x.length); for (int i = 0; i <= (x.length - diff); i++) { final double minValue = x[indices[i]]; final double maxValue = x[indices[i + diff - 1]]; final double range = Math.abs(maxValue - minValue); if (range < minRange) { minRange = range; hpdIndex = i; } } return new double[]{x[indices[hpdIndex]], x[indices[hpdIndex + diff - 1]]}; } /** * compute the cumulative probability Pr(x <= z) for a given z * and a distribution of x * * @param z threshold value * @param x discrete distribution (an unordered list of numbers) * @param indices index sorting x * @return cumulative probability */ public static double cdf(double z, double[] x, int[] indices) { int i; for (i = 0; i < x.length; i++) { if (x[indices[i]] > z) break; } return (double) i / (double) x.length; } /** * compute the cumulative probability Pr(x <= z) for a given z * and a distribution of x * * @param z threshold value * @param x discrete distribution (an unordered list of numbers) * @return cumulative probability */ public static double cdf(double z, double[] x) { int[] indices = new int[x.length]; HeapSort.sort(x, indices); return cdf(z, x, indices); } public static double max(double[] x) { double max = x[0]; for (int i = 1; i < x.length; i++) { if (x[i] > max) max = x[i]; } return max; } public static double min(double[] x) { double min = x[0]; for (int i = 1; i < x.length; i++) { if (x[i] < min) min = x[i]; } return min; } public static double geometricMean(double[] x) { double gm = 0; int len = x.length; for (int i = 0; i < len; i++) { gm += Math.log(x[i]); } return Math.exp(gm/(double) len); } }