/* hypothesis.h: A collection of quantile and quadrature routines for Z, Chi^2, and Student's T hypothesis tests. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #pragma once #include #include #include #include #include #include #include #include "cephes.h" namespace hypothesis { /// Cumulative distribution function of the standard normal distribution inline double stdnormal_cdf(double x) { return std::erfc(-x/std::sqrt(2.0))*0.5; } /// Cumulative distribution function of the Chi^2 distribution inline double chi2_cdf(double x, int dof) { if (dof < 1 || x < 0) { return 0.0; } else if (dof == 2) { return 1.0 - std::exp(-0.5*x); } else { return cephes::rlgamma(0.5 * dof, 0.5 * x); } } /// Cumulative distribution function of Student's T distribution inline double students_t_cdf(double x, int dof) { if (x > 0) return 1-0.5*cephes::incbet(dof * 0.5, 0.5, dof/(x*x+dof)); else return 0.5*cephes::incbet(dof * 0.5, 0.5, dof/(x*x+dof)); } /// adaptive Simpson integration over an 1D interval inline double adaptiveSimpson(const std::function &f, double x0, double x1, double eps = 1e-6, int depth = 6) { int count = 0; /* Define an recursive lambda function for integration over subintervals */ std::function integrate = [&](double a, double b, double c, double fa, double fb, double fc, double I, double eps, int depth) { /* Evaluate the function at two intermediate points */ double d = 0.5 * (a + b), e = 0.5 * (b + c), fd = f(d), fe = f(e); /* Simpson integration over each subinterval */ double h = c-a, I0 = (1.0/12.0) * h * (fa + 4.0*fd + fb), I1 = (1.0/12.0) * h * (fb + 4.0*fe + fc), Ip = I0+I1; ++count; /* Stopping criterion from J.N. Lyness (1969) "Notes on the adaptive Simpson quadrature routine" */ if (depth <= 0 || std::abs(Ip-I) < 15.0*eps) { // Richardson extrapolation return Ip + (1.0/15.0) * (Ip-I); } return integrate(a, d, b, fa, fd, fb, I0, 0.5*eps, depth-1) + integrate(b, e, c, fb, fe, fc, I1, 0.5*eps, depth-1); }; double a = x0, b = 0.5 * (x0+x1), c = x1; double fa = f(a), fb = f(b), fc = f(c); double I = (c-a) * (1.0/6.0) * (fa+4.0*fb+fc); return integrate(a, b, c, fa, fb, fc, I, eps, depth); } /// Nested adaptive Simpson integration over a 2D rectangle inline double adaptiveSimpson2D(const std::function &f, double x0, double y0, double x1, double y1, double eps = 1e-6, int depth = 6) { /* Lambda function that integrates over the X axis */ auto integrate = [&](double y) { return adaptiveSimpson(std::bind(f, std::placeholders::_1, y), x0, x1, eps, depth); }; double value = adaptiveSimpson(integrate, y0, y1, eps, depth); return value; } /** * Peform a Chi^2 test based on the given frequency tables * * \param nCells * Total number of table cells * * \param obsFrequencies * Observed cell frequencies in each cell * * \param expFrequencies * Integrated cell frequencies in each cell (i.e. the noise-free reference) * * \param sampleCount * Total observed sample count * * \param minExpFrequency * Minimum expected cell frequency. The chi^2 test does not work reliably * when the expected frequency in a cell is low (e.g. less than 5), because * normality assumptions break down in this case. Therefore, the * implementation will merge such low-frequency cells when they fall below * the threshold specified here. * * \param significanceLevel * The null hypothesis will be rejected when the associated * p-value is below the significance level specified here. * * \param numTests * Specifies the total number of tests that will be executed. If greater than one, * the Sidak correction will be applied to the significance level. This is because * by conducting multiple independent hypothesis tests in sequence, the probability * of a failure increases accordingly. * * \return * A pair of values containing the test result (success: \c true and failure: \c false) * and a descriptive string */ inline std::pair chi2_test( int nCells, const double *obsFrequencies, const double *expFrequencies, int sampleCount, double minExpFrequency, double significanceLevel, int numTests = 1) { struct Cell { double expFrequency; size_t index; }; /* Sort all cells by their expected frequencies */ std::vector cells(nCells); for (size_t i=0; i sampleCount * 1e-5) { /* Uh oh: samples in a cell that should be completely empty according to the probability density function. Ordinarily, even a single sample requires immediate rejection of the null hypothesis. But due to finite-precision computations and rounding errors, this can occasionally happen without there being an actual bug. Therefore, the criterion here is a bit more lenient. */ oss << "Encountered " << obsFrequencies[c.index] << " samples in a cell " << "with expected frequency 0. Rejecting the null hypothesis!" << std::endl; return std::make_pair(false, oss.str()); } } else if (expFrequencies[c.index] < minExpFrequency) { /* Pool cells with low expected frequencies */ pooledFrequencies += obsFrequencies[c.index]; pooledExpFrequencies += expFrequencies[c.index]; pooledCells++; } else if (pooledExpFrequencies > 0 && pooledExpFrequencies < minExpFrequency) { /* Keep on pooling cells until a sufficiently high expected frequency is achieved. */ pooledFrequencies += obsFrequencies[c.index]; pooledExpFrequencies += expFrequencies[c.index]; pooledCells++; } else { double diff = obsFrequencies[c.index] - expFrequencies[c.index]; chsq += (diff*diff) / expFrequencies[c.index]; ++dof; } } if (pooledExpFrequencies > 0 || pooledFrequencies > 0) { oss << "Pooled " << pooledCells << " to ensure sufficiently high expected " "cell frequencies (>" << minExpFrequency << ")" << std::endl; double diff = pooledFrequencies - pooledExpFrequencies; chsq += (diff*diff) / pooledExpFrequencies; ++dof; } /* All parameters are assumed to be known, so there is no additional DF reduction due to model parameters */ dof -= 1; if (dof <= 0) { oss << "The number of degrees of freedom (" << dof << ") is too low!" << std::endl; return std::make_pair(false, oss.str()); } oss << "Chi^2 statistic = " << chsq << " (d.o.f. = " << dof << ")" << std::endl; /* Probability of obtaining a test statistic at least as extreme as the one observed under the assumption that the distributions match */ double pval = 1 - (double) chi2_cdf(chsq, dof); /* Apply the Sidak correction term, since we'll be conducting multiple independent hypothesis tests. This accounts for the fact that the probability of a failure increases quickly when several hypothesis tests are run in sequence. */ double alpha = 1.0 - std::pow(1.0 - significanceLevel, 1.0 / numTests); bool result = false; if (pval < alpha || !std::isfinite(pval)) { oss << "***** Rejected ***** the null hypothesis (p-value = " << pval << ", " "significance level = " << alpha << ")" << std::endl; } else { oss << "Accepted the null hypothesis (p-value = " << pval << ", " "significance level = " << alpha << ")" << std::endl; result = true; } return std::make_pair(result, oss.str()); } /// Write 2D Chi^2 frequency tables to disk in a format that is nicely plottable by Octave and MATLAB inline void chi2_dump(int res1, int res2, const double *obsFrequencies, const double *expFrequencies, const std::string &filename) { std::ofstream f(filename); f << "obsFrequencies = [ "; for (int i=0; i students_t_test(double mean, double variance, double reference, int sampleCount, double significanceLevel, int numTests) { std::ostringstream oss; /* Compute the t statistic */ double t = std::abs(mean - reference) * std::sqrt(sampleCount / std::max(variance, 1e-5)); /* Determine the degrees of freedom, and instantiate a matching distribution object */ int dof = sampleCount - 1; oss << "Sample mean = " << mean << " (reference value = " << reference << ")" << std::endl; oss << "Sample variance = " << variance << std::endl; oss << "t-statistic = " << t << " (d.o.f. = " << dof << ")" << std::endl; /* Compute the p-value */ double pval = 2 * (1 - students_t_cdf(t, dof)); /* Apply the Sidak correction term, since we'll be conducting multiple independent hypothesis tests. This accounts for the fact that the probability of a failure increases quickly when several hypothesis tests are run in sequence. */ double alpha = 1.0 - std::pow(1.0 - significanceLevel, 1.0 / numTests); bool result = false; if (pval < alpha) { oss << "***** Rejected ***** the null hypothesis (p-value = " << pval << ", " "significance level = " << alpha << ")" << std::endl; } else { oss << "Accepted the null hypothesis (p-value = " << pval << ", " "significance level = " << alpha << ")" << std::endl; result = true; } return std::make_pair(result, oss.str()); } }; /* namespace hypothesis */