Disabled external gits
This commit is contained in:
21
cs440-acg/ext/hypothesis/LICENSE
Normal file
21
cs440-acg/ext/hypothesis/LICENSE
Normal file
@@ -0,0 +1,21 @@
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the <organization> nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
7
cs440-acg/ext/hypothesis/README.md
Normal file
7
cs440-acg/ext/hypothesis/README.md
Normal file
@@ -0,0 +1,7 @@
|
||||
# hypothesis.h
|
||||
## A collection of quantiles and utility functions for running Z, Chi^2, and Student's T hypothesis tests
|
||||
|
||||
A variety of quantile functions are needed to perform statistical hypothesis
|
||||
tests, but these are missing from the C++ standard library. This compact header
|
||||
file-only library contains the most important quantiles; it is mostly a wrapper
|
||||
around a C++ port of the relevant functions from the Cephes math library.
|
404
cs440-acg/ext/hypothesis/cephes.h
Normal file
404
cs440-acg/ext/hypothesis/cephes.h
Normal file
@@ -0,0 +1,404 @@
|
||||
/*
|
||||
cephes.h: A subset of cephes math routines used by hypothesis.h
|
||||
|
||||
Redistributed under the BSD license with permission of the author, see
|
||||
https://github.com/deepmind/torch-cephes/blob/master/LICENSE.txt
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the <organization> nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cmath>
|
||||
#include <stdexcept>
|
||||
|
||||
namespace cephes {
|
||||
/* Rescaling factors for the continued fraction recurrences:
   'big' equals 2^52 and 'biginv' is its reciprocal (2^-52). */
static const double big = 4.503599627370496e15;
static const double biginv = 2.22044604925031308085e-16;

/* Relative accuracy unit (2^-53) used for the convergence thresholds */
static const double MACHEP = 1.11022302462515654042E-16;

/* Largest value of (a + b) for which the gamma factors are evaluated
   directly via tgamma() instead of through logarithms */
static const double MAXGAM = 171.624376956302725;

/* Bounds on log-domain magnitudes: results whose logarithm drops
   below MINLOG are flushed to zero, and logarithms whose magnitude
   reaches MAXLOG force the log-domain code path */
static const double MAXLOG = 7.09782712893383996843E2;
static const double MINLOG = -7.08396418532264106224E2;

/* Helper routines used by incbet(); their definitions follow it */
static double incbcf(double a, double b, double x);
static double incbd(double a, double b, double x);
static double pseries(double a, double b, double x);
|
||||
|
||||
inline double incbet(double aa, double bb, double xx) {
|
||||
double a, b, t, x, xc, w, y;
|
||||
int flag;
|
||||
|
||||
if (aa <= 0.0 || bb <= 0.0)
|
||||
goto domerr;
|
||||
|
||||
if ((xx <= 0.0) || (xx >= 1.0)) {
|
||||
if (xx == 0.0)
|
||||
return 0.0;
|
||||
if (xx == 1.0)
|
||||
return 1.0;
|
||||
domerr:
|
||||
throw std::runtime_error("incbet: domain error!");
|
||||
}
|
||||
|
||||
flag = 0;
|
||||
if ((bb * xx) <= 1.0 && xx <= 0.95) {
|
||||
t = pseries(aa, bb, xx);
|
||||
goto done;
|
||||
}
|
||||
|
||||
w = 1.0 - xx;
|
||||
|
||||
/* Reverse a and b if x is greater than the mean. */
|
||||
if (xx > (aa / (aa + bb))) {
|
||||
flag = 1;
|
||||
a = bb;
|
||||
b = aa;
|
||||
xc = xx;
|
||||
x = w;
|
||||
} else {
|
||||
a = aa;
|
||||
b = bb;
|
||||
xc = w;
|
||||
x = xx;
|
||||
}
|
||||
|
||||
if (flag == 1 && (b * x) <= 1.0 && x <= 0.95) {
|
||||
t = pseries(a, b, x);
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* Choose expansion for better convergence. */
|
||||
y = x * (a + b - 2.0) - (a - 1.0);
|
||||
if (y < 0.0)
|
||||
w = incbcf(a, b, x);
|
||||
else
|
||||
w = incbd(a, b, x) / xc;
|
||||
|
||||
/* Multiply w by the factor
|
||||
a b _ _ _
|
||||
x (1-x) | (a+b) / ( a | (a) | (b) ) . */
|
||||
|
||||
y = a * std::log(x);
|
||||
t = b * std::log(xc);
|
||||
if ((a + b) < MAXGAM && std::abs(y) < MAXLOG && std::abs(t) < MAXLOG) {
|
||||
t = pow(xc, b);
|
||||
t *= pow(x, a);
|
||||
t /= a;
|
||||
t *= w;
|
||||
t *= std::tgamma(a + b) / (std::tgamma(a) * std::tgamma(b));
|
||||
goto done;
|
||||
}
|
||||
/* Resort to logarithms. */
|
||||
y += t + std::lgamma(a + b) - std::lgamma(a) - std::lgamma(b);
|
||||
y += std::log(w / a);
|
||||
if (y < MINLOG)
|
||||
t = 0.0;
|
||||
else
|
||||
t = std::exp(y);
|
||||
|
||||
done:
|
||||
|
||||
if (flag == 1) {
|
||||
if (t <= MACHEP)
|
||||
t = 1.0 - MACHEP;
|
||||
else
|
||||
t = 1.0 - t;
|
||||
}
|
||||
return t;
|
||||
}
|
||||
|
||||
/* Continued fraction expansion #1
|
||||
* for incomplete beta integral
|
||||
*/
|
||||
inline static double incbcf(double a, double b, double x) {
|
||||
double xk, pk, pkm1, pkm2, qk, qkm1, qkm2;
|
||||
double k1, k2, k3, k4, k5, k6, k7, k8;
|
||||
double r, t, ans, thresh;
|
||||
int n;
|
||||
|
||||
k1 = a;
|
||||
k2 = a + b;
|
||||
k3 = a;
|
||||
k4 = a + 1.0;
|
||||
k5 = 1.0;
|
||||
k6 = b - 1.0;
|
||||
k7 = k4;
|
||||
k8 = a + 2.0;
|
||||
|
||||
pkm2 = 0.0;
|
||||
qkm2 = 1.0;
|
||||
pkm1 = 1.0;
|
||||
qkm1 = 1.0;
|
||||
ans = 1.0;
|
||||
r = 1.0;
|
||||
n = 0;
|
||||
thresh = 3.0 * MACHEP;
|
||||
do {
|
||||
|
||||
xk = -(x * k1 * k2) / (k3 * k4);
|
||||
pk = pkm1 + pkm2 * xk;
|
||||
qk = qkm1 + qkm2 * xk;
|
||||
pkm2 = pkm1;
|
||||
pkm1 = pk;
|
||||
qkm2 = qkm1;
|
||||
qkm1 = qk;
|
||||
|
||||
xk = (x * k5 * k6) / (k7 * k8);
|
||||
pk = pkm1 + pkm2 * xk;
|
||||
qk = qkm1 + qkm2 * xk;
|
||||
pkm2 = pkm1;
|
||||
pkm1 = pk;
|
||||
qkm2 = qkm1;
|
||||
qkm1 = qk;
|
||||
|
||||
if (qk != 0)
|
||||
r = pk / qk;
|
||||
if (r != 0) {
|
||||
t = std::abs((ans - r) / r);
|
||||
ans = r;
|
||||
} else
|
||||
t = 1.0;
|
||||
|
||||
if (t < thresh)
|
||||
goto cdone;
|
||||
|
||||
k1 += 1.0;
|
||||
k2 += 1.0;
|
||||
k3 += 2.0;
|
||||
k4 += 2.0;
|
||||
k5 += 1.0;
|
||||
k6 -= 1.0;
|
||||
k7 += 2.0;
|
||||
k8 += 2.0;
|
||||
|
||||
if ((std::abs(qk) + std::abs(pk)) > big) {
|
||||
pkm2 *= biginv;
|
||||
pkm1 *= biginv;
|
||||
qkm2 *= biginv;
|
||||
qkm1 *= biginv;
|
||||
}
|
||||
if ((std::abs(qk) < biginv) || (std::abs(pk) < biginv)) {
|
||||
pkm2 *= big;
|
||||
pkm1 *= big;
|
||||
qkm2 *= big;
|
||||
qkm1 *= big;
|
||||
}
|
||||
} while (++n < 300);
|
||||
|
||||
cdone:
|
||||
return (ans);
|
||||
}
|
||||
|
||||
/* Continued fraction expansion #2
|
||||
* for incomplete beta integral
|
||||
*/
|
||||
inline static double incbd(double a, double b, double x) {
|
||||
double xk, pk, pkm1, pkm2, qk, qkm1, qkm2;
|
||||
double k1, k2, k3, k4, k5, k6, k7, k8;
|
||||
double r, t, ans, z, thresh;
|
||||
int n;
|
||||
|
||||
k1 = a;
|
||||
k2 = b - 1.0;
|
||||
k3 = a;
|
||||
k4 = a + 1.0;
|
||||
k5 = 1.0;
|
||||
k6 = a + b;
|
||||
k7 = a + 1.0;
|
||||
k8 = a + 2.0;
|
||||
|
||||
pkm2 = 0.0;
|
||||
qkm2 = 1.0;
|
||||
pkm1 = 1.0;
|
||||
qkm1 = 1.0;
|
||||
z = x / (1.0 - x);
|
||||
ans = 1.0;
|
||||
r = 1.0;
|
||||
n = 0;
|
||||
thresh = 3.0 * MACHEP;
|
||||
do {
|
||||
|
||||
xk = -(z * k1 * k2) / (k3 * k4);
|
||||
pk = pkm1 + pkm2 * xk;
|
||||
qk = qkm1 + qkm2 * xk;
|
||||
pkm2 = pkm1;
|
||||
pkm1 = pk;
|
||||
qkm2 = qkm1;
|
||||
qkm1 = qk;
|
||||
|
||||
xk = (z * k5 * k6) / (k7 * k8);
|
||||
pk = pkm1 + pkm2 * xk;
|
||||
qk = qkm1 + qkm2 * xk;
|
||||
pkm2 = pkm1;
|
||||
pkm1 = pk;
|
||||
qkm2 = qkm1;
|
||||
qkm1 = qk;
|
||||
|
||||
if (qk != 0)
|
||||
r = pk / qk;
|
||||
if (r != 0) {
|
||||
t = std::abs((ans - r) / r);
|
||||
ans = r;
|
||||
} else
|
||||
t = 1.0;
|
||||
|
||||
if (t < thresh)
|
||||
goto cdone;
|
||||
|
||||
k1 += 1.0;
|
||||
k2 -= 1.0;
|
||||
k3 += 2.0;
|
||||
k4 += 2.0;
|
||||
k5 += 1.0;
|
||||
k6 += 1.0;
|
||||
k7 += 2.0;
|
||||
k8 += 2.0;
|
||||
|
||||
if ((std::abs(qk) + std::abs(pk)) > big) {
|
||||
pkm2 *= biginv;
|
||||
pkm1 *= biginv;
|
||||
qkm2 *= biginv;
|
||||
qkm1 *= biginv;
|
||||
}
|
||||
if ((std::abs(qk) < biginv) || (std::abs(pk) < biginv)) {
|
||||
pkm2 *= big;
|
||||
pkm1 *= big;
|
||||
qkm2 *= big;
|
||||
qkm1 *= big;
|
||||
}
|
||||
} while (++n < 300);
|
||||
cdone:
|
||||
return (ans);
|
||||
}
|
||||
|
||||
/* Power series for incomplete beta integral.
|
||||
Use when b*x is small and x not too close to 1. */
|
||||
inline static double pseries(double a, double b, double x) {
|
||||
double s, t, u, v, n, t1, z, ai;
|
||||
|
||||
ai = 1.0 / a;
|
||||
u = (1.0 - b) * x;
|
||||
v = u / (a + 1.0);
|
||||
t1 = v;
|
||||
t = u;
|
||||
n = 2.0;
|
||||
s = 0.0;
|
||||
z = MACHEP * ai;
|
||||
while (std::abs(v) > z) {
|
||||
u = (n - b) * x / n;
|
||||
t *= u;
|
||||
v = t / (a + n);
|
||||
s += v;
|
||||
n += 1.0;
|
||||
}
|
||||
s += t1;
|
||||
s += ai;
|
||||
|
||||
u = a * std::log(x);
|
||||
if ((a + b) < MAXGAM && std::abs(u) < MAXLOG) {
|
||||
t = std::tgamma(a + b) / (std::tgamma(a) * std::tgamma(b));
|
||||
s = s * t * pow(x, a);
|
||||
} else {
|
||||
t = std::lgamma(a + b) - std::lgamma(a) - std::lgamma(b) + u + std::log(s);
|
||||
if (t < MINLOG)
|
||||
s = 0.0;
|
||||
else
|
||||
s = std::exp(t);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
/// Regularized lower incomplete gamma function
|
||||
inline double rlgamma(double a, double x) {
|
||||
const double epsilon = 0.000000000000001;
|
||||
|
||||
if (a < 0 || x < 0)
|
||||
throw std::runtime_error("LLGamma: invalid arguments range!");
|
||||
|
||||
if (x == 0)
|
||||
return 0.0;
|
||||
|
||||
double ax = (a * std::log(x)) - x - std::lgamma(a);
|
||||
if (ax < -709.78271289338399)
|
||||
return a < x ? 1.0 : 0.0;
|
||||
|
||||
if (x <= 1 || x <= a) {
|
||||
double r2 = a;
|
||||
double c2 = 1;
|
||||
double ans2 = 1;
|
||||
|
||||
do {
|
||||
r2 = r2 + 1;
|
||||
c2 = c2 * x / r2;
|
||||
ans2 += c2;
|
||||
} while ((c2 / ans2) > epsilon);
|
||||
|
||||
return std::exp(ax) * ans2 / a;
|
||||
}
|
||||
|
||||
int c = 0;
|
||||
double y = 1 - a;
|
||||
double z = x + y + 1;
|
||||
double p3 = 1;
|
||||
double q3 = x;
|
||||
double p2 = x + 1;
|
||||
double q2 = z * x;
|
||||
double ans = p2 / q2;
|
||||
double error;
|
||||
|
||||
do {
|
||||
c++;
|
||||
y += 1;
|
||||
z += 2;
|
||||
double yc = y * c;
|
||||
double p = (p2 * z) - (p3 * yc);
|
||||
double q = (q2 * z) - (q3 * yc);
|
||||
|
||||
if (q != 0) {
|
||||
double nextans = p / q;
|
||||
error = std::abs((ans - nextans) / nextans);
|
||||
ans = nextans;
|
||||
} else {
|
||||
// zero div, skip
|
||||
error = 1;
|
||||
}
|
||||
|
||||
// shift
|
||||
p3 = p2;
|
||||
p2 = p;
|
||||
q3 = q2;
|
||||
q2 = q;
|
||||
|
||||
// normalize fraction when the numerator becomes large
|
||||
if (std::abs(p) > big) {
|
||||
p3 *= biginv;
|
||||
p2 *= biginv;
|
||||
q3 *= biginv;
|
||||
q2 *= biginv;
|
||||
}
|
||||
} while (error > epsilon);
|
||||
|
||||
return 1.0 - (std::exp(ax) * ans);
|
||||
}
|
||||
};
|
355
cs440-acg/ext/hypothesis/hypothesis.h
Normal file
355
cs440-acg/ext/hypothesis/hypothesis.h
Normal file
@@ -0,0 +1,355 @@
|
||||
/*
|
||||
hypothesis.h: A collection of quantile and quadrature routines
|
||||
for Z, Chi^2, and Student's T hypothesis tests.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the <organization> nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
#include "cephes.h"
|
||||
|
||||
namespace hypothesis {
|
||||
/// Cumulative distribution function of the standard normal distribution,
/// expressed through the complementary error function
inline double stdnormal_cdf(double x) {
    const double scaled = -x / std::sqrt(2.0);
    return 0.5 * std::erfc(scaled);
}
|
||||
|
||||
/// Cumulative distribution function of the Chi^2 distribution
|
||||
inline double chi2_cdf(double x, int dof) {
|
||||
if (dof < 1 || x < 0) {
|
||||
return 0.0;
|
||||
} else if (dof == 2) {
|
||||
return 1.0 - std::exp(-0.5*x);
|
||||
} else {
|
||||
return cephes::rlgamma(0.5 * dof, 0.5 * x);
|
||||
}
|
||||
}
|
||||
|
||||
/// Cumulative distribution function of Student's T distribution
|
||||
inline double students_t_cdf(double x, int dof) {
|
||||
if (x > 0)
|
||||
return 1-0.5*cephes::incbet(dof * 0.5, 0.5, dof/(x*x+dof));
|
||||
else
|
||||
return 0.5*cephes::incbet(dof * 0.5, 0.5, dof/(x*x+dof));
|
||||
}
|
||||
|
||||
/**
 * Adaptive Simpson integration over a 1D interval
 *
 * \param f
 *     Integrand
 *
 * \param x0
 *     Lower integration bound
 *
 * \param x1
 *     Upper integration bound
 *
 * \param eps
 *     Error tolerance used by the stopping criterion; it is halved
 *     each time an interval is subdivided
 *
 * \param depth
 *     Maximum number of recursive subdivisions
 *
 * \return
 *     The estimated value of the integral
 */
inline double adaptiveSimpson(const std::function<double (double)> &f, double x0, double x1, double eps = 1e-6, int depth = 6) {
    /* Define a recursive lambda function for integration over subintervals.
       (a, b, c) are the left, middle and right abscissae with function
       values (fa, fb, fc); I is the Simpson estimate over [a, c]. */
    std::function<double (double, double, double, double, double, double, double, double, int)> integrate =
        [&](double a, double b, double c, double fa, double fb, double fc, double I, double tol, int level) -> double {
            /* Evaluate the function at two intermediate points */
            double d = 0.5 * (a + b), e = 0.5 * (b + c), fd = f(d), fe = f(e);

            /* Simpson integration over each subinterval */
            double h = c-a,
                   I0 = (1.0/12.0) * h * (fa + 4.0*fd + fb),
                   I1 = (1.0/12.0) * h * (fb + 4.0*fe + fc),
                   Ip = I0+I1;

            /* Stopping criterion from J.N. Lyness (1969)
              "Notes on the adaptive Simpson quadrature routine" */
            if (level <= 0 || std::abs(Ip-I) < 15.0*tol) {
                // Richardson extrapolation
                return Ip + (1.0/15.0) * (Ip-I);
            }

            return integrate(a, d, b, fa, fd, fb, I0, 0.5*tol, level-1) +
                   integrate(b, e, c, fb, fe, fc, I1, 0.5*tol, level-1);
        };

    double a = x0, b = 0.5 * (x0+x1), c = x1;
    double fa = f(a), fb = f(b), fc = f(c);
    double I = (c-a) * (1.0/6.0) * (fa+4.0*fb+fc);
    return integrate(a, b, c, fa, fb, fc, I, eps, depth);
}
|
||||
|
||||
/// Nested adaptive Simpson integration over a 2D rectangle
|
||||
inline double adaptiveSimpson2D(const std::function<double (double, double)> &f, double x0, double y0,
|
||||
double x1, double y1, double eps = 1e-6, int depth = 6) {
|
||||
/* Lambda function that integrates over the X axis */
|
||||
auto integrate = [&](double y) {
|
||||
return adaptiveSimpson(std::bind(f, std::placeholders::_1, y), x0, x1, eps, depth);
|
||||
};
|
||||
double value = adaptiveSimpson(integrate, y0, y1, eps, depth);
|
||||
return value;
|
||||
}
|
||||
|
||||
/**
|
||||
* Peform a Chi^2 test based on the given frequency tables
|
||||
*
|
||||
* \param nCells
|
||||
* Total number of table cells
|
||||
*
|
||||
* \param obsFrequencies
|
||||
* Observed cell frequencies in each cell
|
||||
*
|
||||
* \param expFrequencies
|
||||
* Integrated cell frequencies in each cell (i.e. the noise-free reference)
|
||||
*
|
||||
* \param sampleCount
|
||||
* Total observed sample count
|
||||
*
|
||||
* \param minExpFrequency
|
||||
* Minimum expected cell frequency. The chi^2 test does not work reliably
|
||||
* when the expected frequency in a cell is low (e.g. less than 5), because
|
||||
* normality assumptions break down in this case. Therefore, the
|
||||
* implementation will merge such low-frequency cells when they fall below
|
||||
* the threshold specified here.
|
||||
*
|
||||
* \param significanceLevel
|
||||
* The null hypothesis will be rejected when the associated
|
||||
* p-value is below the significance level specified here.
|
||||
*
|
||||
* \param numTests
|
||||
* Specifies the total number of tests that will be executed. If greater than one,
|
||||
* the Sidak correction will be applied to the significance level. This is because
|
||||
* by conducting multiple independent hypothesis tests in sequence, the probability
|
||||
* of a failure increases accordingly.
|
||||
*
|
||||
* \return
|
||||
* A pair of values containing the test result (success: \c true and failure: \c false)
|
||||
* and a descriptive string
|
||||
*/
|
||||
inline std::pair<bool, std::string> chi2_test(
|
||||
int nCells, const double *obsFrequencies, const double *expFrequencies,
|
||||
int sampleCount, double minExpFrequency, double significanceLevel, int numTests = 1) {
|
||||
|
||||
struct Cell {
|
||||
double expFrequency;
|
||||
size_t index;
|
||||
};
|
||||
|
||||
/* Sort all cells by their expected frequencies */
|
||||
std::vector<Cell> cells(nCells);
|
||||
for (size_t i=0; i<cells.size(); ++i) {
|
||||
cells[i].expFrequency = expFrequencies[i];
|
||||
cells[i].index = i;
|
||||
}
|
||||
std::sort(cells.begin(), cells.end(), [](const Cell &a, const Cell &b) {
|
||||
return a.expFrequency < b.expFrequency;
|
||||
});
|
||||
|
||||
/* Compute the Chi^2 statistic and pool cells as necessary */
|
||||
double pooledFrequencies = 0, pooledExpFrequencies = 0, chsq = 0;
|
||||
int pooledCells = 0, dof = 0;
|
||||
|
||||
std::ostringstream oss;
|
||||
for (const Cell &c : cells) {
|
||||
if (expFrequencies[c.index] < 0) {
|
||||
oss << "Encountered a negative expected number of samples ("
|
||||
<< expFrequencies[c.index]
|
||||
<< "). Rejecting the null hypothesis!" << std::endl;
|
||||
return std::make_pair(false, oss.str());
|
||||
} else if (expFrequencies[c.index] == 0) {
|
||||
if (obsFrequencies[c.index] > sampleCount * 1e-5) {
|
||||
/* Uh oh: samples in a cell that should be completely empty
|
||||
according to the probability density function. Ordinarily,
|
||||
even a single sample requires immediate rejection of the null
|
||||
hypothesis. But due to finite-precision computations and rounding
|
||||
errors, this can occasionally happen without there being an
|
||||
actual bug. Therefore, the criterion here is a bit more lenient. */
|
||||
|
||||
oss << "Encountered " << obsFrequencies[c.index] << " samples in a cell "
|
||||
<< "with expected frequency 0. Rejecting the null hypothesis!" << std::endl;
|
||||
return std::make_pair(false, oss.str());
|
||||
}
|
||||
} else if (expFrequencies[c.index] < minExpFrequency) {
|
||||
/* Pool cells with low expected frequencies */
|
||||
pooledFrequencies += obsFrequencies[c.index];
|
||||
pooledExpFrequencies += expFrequencies[c.index];
|
||||
pooledCells++;
|
||||
} else if (pooledExpFrequencies > 0 && pooledExpFrequencies < minExpFrequency) {
|
||||
/* Keep on pooling cells until a sufficiently high
|
||||
expected frequency is achieved. */
|
||||
pooledFrequencies += obsFrequencies[c.index];
|
||||
pooledExpFrequencies += expFrequencies[c.index];
|
||||
pooledCells++;
|
||||
} else {
|
||||
double diff = obsFrequencies[c.index] - expFrequencies[c.index];
|
||||
chsq += (diff*diff) / expFrequencies[c.index];
|
||||
++dof;
|
||||
}
|
||||
}
|
||||
|
||||
if (pooledExpFrequencies > 0 || pooledFrequencies > 0) {
|
||||
oss << "Pooled " << pooledCells << " to ensure sufficiently high expected "
|
||||
"cell frequencies (>" << minExpFrequency << ")" << std::endl;
|
||||
double diff = pooledFrequencies - pooledExpFrequencies;
|
||||
chsq += (diff*diff) / pooledExpFrequencies;
|
||||
++dof;
|
||||
}
|
||||
|
||||
/* All parameters are assumed to be known, so there is no
|
||||
additional DF reduction due to model parameters */
|
||||
dof -= 1;
|
||||
|
||||
if (dof <= 0) {
|
||||
oss << "The number of degrees of freedom (" << dof << ") is too low!" << std::endl;
|
||||
return std::make_pair(false, oss.str());
|
||||
}
|
||||
|
||||
oss << "Chi^2 statistic = " << chsq << " (d.o.f. = " << dof << ")" << std::endl;
|
||||
|
||||
/* Probability of obtaining a test statistic at least
|
||||
as extreme as the one observed under the assumption
|
||||
that the distributions match */
|
||||
double pval = 1 - (double) chi2_cdf(chsq, dof);
|
||||
|
||||
/* Apply the Sidak correction term, since we'll be conducting multiple independent
|
||||
hypothesis tests. This accounts for the fact that the probability of a failure
|
||||
increases quickly when several hypothesis tests are run in sequence. */
|
||||
double alpha = 1.0 - std::pow(1.0 - significanceLevel, 1.0 / numTests);
|
||||
|
||||
bool result = false;
|
||||
if (pval < alpha || !std::isfinite(pval)) {
|
||||
oss << "***** Rejected ***** the null hypothesis (p-value = " << pval << ", "
|
||||
"significance level = " << alpha << ")" << std::endl;
|
||||
} else {
|
||||
oss << "Accepted the null hypothesis (p-value = " << pval << ", "
|
||||
"significance level = " << alpha << ")" << std::endl;
|
||||
result = true;
|
||||
}
|
||||
return std::make_pair(result, oss.str());
|
||||
}
|
||||
|
||||
/// Write 2D Chi^2 frequency tables to disk in a format that is nicely plottable by Octave and MATLAB.
/// Both tables are read in row-major order as \c res1 rows of \c res2 entries each.
inline void chi2_dump(int res1, int res2, const double *obsFrequencies, const double *expFrequencies, const std::string &filename) {
    std::ofstream out(filename);

    /* Emits one table as a matrix literal: entries of a row are
       separated by ", " and rows by "; " */
    auto writeMatrix = [&](const char *header, const double *values) {
        out << header;
        for (int row=0; row<res1; ++row) {
            for (int col=0; col<res2; ++col) {
                out << values[row*res2+col];
                if (col+1 < res2)
                    out << ", ";
            }
            if (row+1 < res1)
                out << "; ";
        }
        out << " ];" << std::endl;
    };

    writeMatrix("obsFrequencies = [ ", obsFrequencies);
    writeMatrix("expFrequencies = [ ", expFrequencies);

    /* Plotting commands that display both tables underneath each other */
    out << "colormap(jet);" << std::endl;
    out << "clf; subplot(2,1,1);" << std::endl;
    out << "imagesc(obsFrequencies);" << std::endl;
    out << "title('Observed frequencies');" << std::endl;
    out << "axis equal;" << std::endl;
    out << "subplot(2,1,2);" << std::endl;
    out << "imagesc(expFrequencies);" << std::endl;
    out << "axis equal;" << std::endl;
    out << "title('Expected frequencies');" << std::endl;
    out.close();
}
|
||||
|
||||
/**
|
||||
* Peform a two-sided t-test based on the given mean, variance and reference value
|
||||
*
|
||||
* This test analyzes whether the expected value of a random variable matches a
|
||||
* certain known value. When there is significant statistical "evidence"
|
||||
* against this hypothesis, the test fails.
|
||||
*
|
||||
* This is useful in checking whether a Monte Carlo method method converges
|
||||
* against the right value. Because statistical tests are able to handle the
|
||||
* inherent noise of these methods, they can be used to construct statistical
|
||||
* test suites not unlike the traditional unit tests used in software engineering.
|
||||
*
|
||||
* \param mean
|
||||
* Estimated mean of the statistical estimator
|
||||
*
|
||||
* \param variance
|
||||
* Estimated variance of the statistical estimator
|
||||
*
|
||||
* \param sampleCount
|
||||
* Number of samples used to estimate \c mean and \c variance
|
||||
*
|
||||
* \param reference
|
||||
* A known reference value ("true mean")
|
||||
*
|
||||
* \param significanceLevel
|
||||
* The null hypothesis will be rejected when the associated
|
||||
* p-value is below the significance level specified here.
|
||||
*
|
||||
* \param numTests
|
||||
* Specifies the total number of tests that will be executed. If greater than one,
|
||||
* the Sidak correction will be applied to the significance level. This is because
|
||||
* by conducting multiple independent hypothesis tests in sequence, the probability
|
||||
* of a failure increases accordingly.
|
||||
*
|
||||
* \return
|
||||
* A pair of values containing the test result (success: \c true and failure: \c false)
|
||||
* and a descriptive string
|
||||
*/
|
||||
inline std::pair<bool, std::string>
|
||||
students_t_test(double mean, double variance, double reference,
|
||||
int sampleCount, double significanceLevel, int numTests) {
|
||||
std::ostringstream oss;
|
||||
|
||||
/* Compute the t statistic */
|
||||
double t = std::abs(mean - reference) * std::sqrt(sampleCount / std::max(variance, 1e-5));
|
||||
|
||||
/* Determine the degrees of freedom, and instantiate a matching distribution object */
|
||||
int dof = sampleCount - 1;
|
||||
|
||||
oss << "Sample mean = " << mean << " (reference value = " << reference << ")" << std::endl;
|
||||
oss << "Sample variance = " << variance << std::endl;
|
||||
oss << "t-statistic = " << t << " (d.o.f. = " << dof << ")" << std::endl;
|
||||
|
||||
/* Compute the p-value */
|
||||
double pval = 2 * (1 - students_t_cdf(t, dof));
|
||||
|
||||
/* Apply the Sidak correction term, since we'll be conducting multiple independent
|
||||
hypothesis tests. This accounts for the fact that the probability of a failure
|
||||
increases quickly when several hypothesis tests are run in sequence. */
|
||||
double alpha = 1.0 - std::pow(1.0 - significanceLevel, 1.0 / numTests);
|
||||
|
||||
bool result = false;
|
||||
if (pval < alpha) {
|
||||
oss << "***** Rejected ***** the null hypothesis (p-value = " << pval << ", "
|
||||
"significance level = " << alpha << ")" << std::endl;
|
||||
} else {
|
||||
oss << "Accepted the null hypothesis (p-value = " << pval << ", "
|
||||
"significance level = " << alpha << ")" << std::endl;
|
||||
result = true;
|
||||
}
|
||||
return std::make_pair(result, oss.str());
|
||||
}
|
||||
}; /* namespace hypothesis */
|
Reference in New Issue
Block a user