package gov.nih.ncgc.bard.capextract; import jdistlib.disttest.NormalityTest; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.util.ArrayList; import java.util.List; /** * @author Rajarshi Guha */ public class ResultHistogram { protected Logger log; final Float MAX_VALUE = 1e38f; Long currentEid = -1l; public ResultHistogram() { log = LoggerFactory.getLogger(this.getClass()); } public void generateHistogram(Long bardExptId) throws SQLException { boolean useLog = false; currentEid = bardExptId; Connection conn = CAPUtil.connectToBARD(CAPConstants.getBardDBJDBCUrl()); // first pull result types for this experiment PreparedStatement pst = conn.prepareStatement("select distinct display_name from exploded_results where bard_expt_id = ?"); pst.setLong(1, bardExptId); List<String> resultTypes = new ArrayList<String>(); ResultSet rs = pst.executeQuery(); while (rs.next()) resultTypes.add(rs.getString(1)); rs.close(); pst.close(); // for each result type, delete pre-existing histogram, make new histogram and insert it for (String resultType : resultTypes) { pst = conn.prepareStatement("select value from exploded_results where bard_expt_id = ? and display_name = ?"); pst.setLong(1, bardExptId); pst.setString(2, resultType); rs = pst.executeQuery(); List<Float> values = new ArrayList<Float>(); while (rs.next()) values.add(rs.getFloat(1)); rs.close(); pst.close(); // get the log10 version of the values - we remove // values <= 0 since they would not allow us to // consider the data as a lognormal distribution List<Float> vals = new ArrayList<Float>(); for (Float value : values) { if (value > 0) vals.add((float) Math.log10(value)); } double[] dvals = new double[vals.size()]; for (int i = 0; i < vals.size(); i++) dvals[i] = vals.get(i); double statistic = NormalityTest.anderson_darling_statistic(dvals); double pvalue = NormalityTest.anderson_darling_pvalue(statistic, dvals.length); if (Double.isNaN(pvalue) || pvalue > 0.05) { // log10 data is normal, so replace original values with the log10 values log.info("BARD experiment id " + bardExptId + "/" + resultType + " is lognormal. Histogramming log10 values"); values = vals; useLog = true; } List<HBin> bins = calculateHistogram(values); // get rid of pre-existing histogram pst = conn.prepareStatement("delete from exploded_histograms where bard_expt_id = ? and display_name = ?"); pst.setLong(1, bardExptId); pst.setString(2, resultType); pst.executeUpdate(); pst.close(); // put in new histogram pst = conn.prepareStatement("insert into exploded_histograms (bard_expt_id, display_name, l, u, n) values (?,?,?,?,?)"); for (HBin bin : bins) { pst.setLong(1, bardExptId); pst.setString(2, resultType); if (useLog) { pst.setFloat(3, (float) Math.pow(10, bin.l)); pst.setFloat(4, (float) Math.pow(10, bin.u)); } else { pst.setFloat(3, bin.l); pst.setFloat(4, bin.u); } pst.setInt(5, bin.c); pst.executeUpdate(); pst.clearParameters(); } pst.close(); } conn.commit(); conn.close(); } List<HBin> calculateHistogram(List<Float> values) { List<HBin> bins = new ArrayList<HBin>(); if (values.size() < 3) return bins; float sd = CAPUtil.sd(values); if (sd == 0) return bins; // currently we use Scotts method to get optimal bin width/bin count float h = (float) (3.5 * sd / Math.pow((double) values.size(), 1.0 / 3.0)); // bin width float maxval = Float.MIN_VALUE; float minval = Float.MAX_VALUE; for (Float v : values) { if (v > maxval) maxval = v; if (v < minval) minval = v; } float k = (maxval - minval) / h; // number of bins log.info("EID " + currentEid + ": sd = " + sd + ", h = " + h + ", k = " + k + " (will use " + (int) Math.ceil(k) + " bins)"); // given the number of bins from the bin width algo, we make a set of pretty breaks // based on pretty.R and pretty.c from R 3.0.1 double[] prettyBounds = pretty(minval, maxval, (int) Math.ceil(k), 1, true); minval = (float) prettyBounds[0]; maxval = (float) prettyBounds[1]; // recalculate h based on pretty bounds h = (float) ((maxval - minval) / (prettyBounds[2])); float lower = minval; int nbin = (int) prettyBounds[2] + 1; for (int i = 1; i <= nbin; i++) { HBin bin = new HBin(lower, lower + h, 0); bins.add(bin); lower += h; } // raw breaks - not pretty // float lower = minval; // for (int i = 1; i <= Math.floor(k)+1; i++) { // HBin bin = new HBin(lower, lower + h, 0); // bins.add(bin); // lower += h; // } for (Float v : values) { for (HBin bin : bins) { if (v >= bin.l && v < bin.u) { bin.c++; break; } } } // remove empty bins from the tail of the distribution List<Integer> removeIdx = new ArrayList<Integer>(); for (int i = bins.size() - 1; i >= 0; i--) { if (bins.get(i).c > 0) break; else removeIdx.add(i); } List<HBin> cleanBins = new ArrayList<HBin>(); for (int i = 0; i < bins.size(); i++) { if (!removeIdx.contains(i)) cleanBins.add(bins.get(i)); } return cleanBins; } class HBin { float l, u; int c; HBin(float l, float u, int c) { this.l = l; this.u = u; this.c = c; } } private static float calculateMachineEpsilonFloat() { float machEps = 1.0f; do machEps /= 2.0f; while ((float) (1.0 + (machEps / 2.0)) != 1.0); return machEps; } double[] pretty(double lo, double up, int ndiv, int min_n, boolean return_bounds) { double dx, cell, unit, base, U; double ns, nu; int k; boolean i_small; double shrink_sml = 0.75; double h = 1.5; double h5 = .5 + 1.5 * h; int eps_correction = 0; double rounding_eps = 1e-7; double DBL_EPSILON = calculateMachineEpsilonFloat(); dx = up - lo; if (dx == 0 && up == 0) { /* up == lo == 0 */ cell = 1; i_small = true; } else { cell = Math.max(Math.abs(lo), Math.abs(up)); /* U = upper bound on cell/unit */ // U = (1 + (h5 >= 1.5*h+.5)) ? 1/(1+h) : 1.5/(1+h5); U = 1 / (1 + h); /* added times 3, as several calculations here */ i_small = dx < cell * U * Math.max(1, ndiv) * DBL_EPSILON * 3; } if (i_small) { if (cell > 10) cell = 9 + cell / 10; cell *= shrink_sml; if (min_n > 1) cell /= min_n; } else { cell = dx; if (ndiv > 1) cell /= ndiv; } if (cell < 20 * Double.MIN_VALUE) { cell = 20 * Double.MIN_VALUE; } else if (cell * 10 > Double.MAX_VALUE) { cell = .1 * Double.MAX_VALUE; } base = Math.pow(10., Math.floor(Math.log10(cell))); /* base <= cell < 10*base */ unit = base; if ((U = 2 * base) - cell < h * (cell - unit)) { unit = U; if ((U = 5 * base) - cell < h5 * (cell - unit)) { unit = U; if ((U = 10 * base) - cell < h * (cell - unit)) unit = U; } } ns = Math.floor(lo / unit + rounding_eps); nu = Math.ceil(up / unit - rounding_eps); if (eps_correction == 1 && (eps_correction > 1 || !i_small)) { if (lo != 0) lo *= (1 - DBL_EPSILON); else lo = -Double.MIN_VALUE; if (up != 0) up *= (1 + DBL_EPSILON); else up = +Double.MIN_VALUE; } while (ns * unit > lo + rounding_eps * unit) ns--; while (nu * unit < up - rounding_eps * unit) nu++; k = (int) (0.5 + nu - ns); if (k < min_n) { k = min_n - k; if (ns >= 0.) { nu += k / 2; ns -= k / 2 + k % 2;/* ==> nu-ns = old(nu-ns) + min_n -k = min_n */ } else { ns -= k / 2; nu += k / 2 + k % 2; } ndiv = min_n; } else { ndiv = k; } if (return_bounds) { /* if()'s to ensure that result covers original range */ if (ns * unit < lo) lo = ns * unit; if (nu * unit > up) up = nu * unit; } else { lo = ns; up = nu; } return new double[]{lo, up, ndiv}; } public static void main(String[] args) throws SQLException { ResultHistogram r = new ResultHistogram(); r.generateHistogram(320L); // Connection conn = CAPUtil.connectToBARD(CAPConstants.getBardDBJDBCUrl()); // PreparedStatement pst = conn.prepareStatement("select distinct bard_expt_id from exploded_results"); // ResultSet rs = pst.executeQuery(); // while (rs.next()) r.generateHistogram(rs.getLong(1)); // conn.close(); } }