mirror of
https://github.com/python/cpython.git
synced 2025-11-08 09:32:01 +00:00
bpo-39218: Improve accuracy of variance calculation (GH-27960)
This commit is contained in:
parent
044e8d866f
commit
793f55bde9
3 changed files with 23 additions and 14 deletions
|
|
@ -728,15 +728,19 @@ def _ss(data, c=None):
|
||||||
lead to garbage results.
|
lead to garbage results.
|
||||||
"""
|
"""
|
||||||
if c is not None:
|
if c is not None:
|
||||||
T, total, count = _sum((x-c)**2 for x in data)
|
T, total, count = _sum((d := x - c) * d for x in data)
|
||||||
return (T, total)
|
return (T, total)
|
||||||
|
# Compute the mean accurate to within 1/2 ulp
|
||||||
c = mean(data)
|
c = mean(data)
|
||||||
T, total, count = _sum((x-c)**2 for x in data)
|
# Initial computation for the sum of square deviations
|
||||||
# The following sum should mathematically equal zero, but due to rounding
|
T, total, count = _sum((d := x - c) * d for x in data)
|
||||||
# error may not.
|
# Correct any remaining inaccuracy in the mean c.
|
||||||
U, total2, count2 = _sum((x - c) for x in data)
|
# The following sum should mathematically equal zero,
|
||||||
assert T == U and count == count2
|
# but due to the final rounding of the mean, it may not.
|
||||||
total -= total2 ** 2 / len(data)
|
U, error, count2 = _sum((x - c) for x in data)
|
||||||
|
assert count == count2
|
||||||
|
correction = error * error / len(data)
|
||||||
|
total -= correction
|
||||||
assert not total < 0, 'negative sum of square deviations: %f' % total
|
assert not total < 0, 'negative sum of square deviations: %f' % total
|
||||||
return (T, total)
|
return (T, total)
|
||||||
|
|
||||||
|
|
@ -924,8 +928,8 @@ def correlation(x, y, /):
|
||||||
xbar = fsum(x) / n
|
xbar = fsum(x) / n
|
||||||
ybar = fsum(y) / n
|
ybar = fsum(y) / n
|
||||||
sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
|
sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
|
||||||
sxx = fsum((xi - xbar) ** 2.0 for xi in x)
|
sxx = fsum((d := xi - xbar) * d for xi in x)
|
||||||
syy = fsum((yi - ybar) ** 2.0 for yi in y)
|
syy = fsum((d := yi - ybar) * d for yi in y)
|
||||||
try:
|
try:
|
||||||
return sxy / sqrt(sxx * syy)
|
return sxy / sqrt(sxx * syy)
|
||||||
except ZeroDivisionError:
|
except ZeroDivisionError:
|
||||||
|
|
@ -968,7 +972,7 @@ def linear_regression(x, y, /):
|
||||||
xbar = fsum(x) / n
|
xbar = fsum(x) / n
|
||||||
ybar = fsum(y) / n
|
ybar = fsum(y) / n
|
||||||
sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
|
sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
|
||||||
sxx = fsum((xi - xbar) ** 2.0 for xi in x)
|
sxx = fsum((d := xi - xbar) * d for xi in x)
|
||||||
try:
|
try:
|
||||||
slope = sxy / sxx # equivalent to: covariance(x, y) / variance(x)
|
slope = sxy / sxx # equivalent to: covariance(x, y) / variance(x)
|
||||||
except ZeroDivisionError:
|
except ZeroDivisionError:
|
||||||
|
|
@ -1094,10 +1098,11 @@ def samples(self, n, *, seed=None):
|
||||||
|
|
||||||
def pdf(self, x):
|
def pdf(self, x):
|
||||||
"Probability density function. P(x <= X < x+dx) / dx"
|
"Probability density function. P(x <= X < x+dx) / dx"
|
||||||
variance = self._sigma ** 2.0
|
variance = self._sigma * self._sigma
|
||||||
if not variance:
|
if not variance:
|
||||||
raise StatisticsError('pdf() not defined when sigma is zero')
|
raise StatisticsError('pdf() not defined when sigma is zero')
|
||||||
return exp((x - self._mu)**2.0 / (-2.0*variance)) / sqrt(tau*variance)
|
diff = x - self._mu
|
||||||
|
return exp(diff * diff / (-2.0 * variance)) / sqrt(tau * variance)
|
||||||
|
|
||||||
def cdf(self, x):
|
def cdf(self, x):
|
||||||
"Cumulative distribution function. P(X <= x)"
|
"Cumulative distribution function. P(X <= x)"
|
||||||
|
|
@ -1161,7 +1166,7 @@ def overlap(self, other):
|
||||||
if not dv:
|
if not dv:
|
||||||
return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0)))
|
return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0)))
|
||||||
a = X._mu * Y_var - Y._mu * X_var
|
a = X._mu * Y_var - Y._mu * X_var
|
||||||
b = X._sigma * Y._sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
|
b = X._sigma * Y._sigma * sqrt(dm * dm + dv * log(Y_var / X_var))
|
||||||
x1 = (a + b) / dv
|
x1 = (a + b) / dv
|
||||||
x2 = (a - b) / dv
|
x2 = (a - b) / dv
|
||||||
return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))
|
return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))
|
||||||
|
|
@ -1204,7 +1209,7 @@ def stdev(self):
|
||||||
@property
|
@property
|
||||||
def variance(self):
|
def variance(self):
|
||||||
"Square of the standard deviation."
|
"Square of the standard deviation."
|
||||||
return self._sigma ** 2.0
|
return self._sigma * self._sigma
|
||||||
|
|
||||||
def __add__(x1, x2):
|
def __add__(x1, x2):
|
||||||
"""Add a constant or another NormalDist instance.
|
"""Add a constant or another NormalDist instance.
|
||||||
|
|
|
||||||
|
|
@ -1210,6 +1210,9 @@ def __pow__(self, other):
|
||||||
def __add__(self, other):
|
def __add__(self, other):
|
||||||
return type(self)(super().__add__(other))
|
return type(self)(super().__add__(other))
|
||||||
__radd__ = __add__
|
__radd__ = __add__
|
||||||
|
def __mul__(self, other):
|
||||||
|
return type(self)(super().__mul__(other))
|
||||||
|
__rmul__ = __mul__
|
||||||
return (float, Decimal, Fraction, MyFloat)
|
return (float, Decimal, Fraction, MyFloat)
|
||||||
|
|
||||||
def test_types_conserved(self):
|
def test_types_conserved(self):
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
Improve accuracy of variance calculations by using x*x instead of x**2.
|
||||||
Loading…
Add table
Add a link
Reference in a new issue