bpo-39218: Improve accuracy of variance calculation (GH-27960)

2025-11-08 09:32:01 +00:00 · 2021-08-30 20:57:30 -05:00 · 2021-08-30 20:57:30 -05:00 · 793f55bde9
commit 793f55bde9
parent 044e8d866f
3 changed files with 23 additions and 14 deletions
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@ -728,15 +728,19 @@ def _ss(data, c=None):
    lead to garbage results.
    """
    if c is not None:
-        T, total, count = _sum((x-c)**2 for x in data)
+        T, total, count = _sum((d := x - c) * d for x in data)
        return (T, total)
    # Compute the mean accurate to within 1/2 ulp
    c = mean(data)
-    T, total, count = _sum((x-c)**2 for x in data)
+    # Initial computation for the sum of square deviations
-    # The following sum should mathematically equal zero, but due to rounding
+    T, total, count = _sum((d := x - c) * d for x in data)
-    # error may not.
+    # Correct any remaining inaccuracy in the mean c.
-    U, total2, count2 = _sum((x - c) for x in data)
+    # The following sum should mathematically equal zero,
-    assert T == U and count == count2
+    # but due to the final rounding of the mean, it may not.
-    total -= total2 ** 2 / len(data)
+    U, error, count2 = _sum((x - c) for x in data)
    assert count == count2
    correction = error * error / len(data)
    total -= correction
    assert not total < 0, 'negative sum of square deviations: %f' % total
    return (T, total)
@ -924,8 +928,8 @@ def correlation(x, y, /):
    xbar = fsum(x) / n
    ybar = fsum(y) / n
    sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
-    sxx = fsum((xi - xbar) ** 2.0 for xi in x)
+    sxx = fsum((d := xi - xbar) * d for xi in x)
-    syy = fsum((yi - ybar) ** 2.0 for yi in y)
+    syy = fsum((d := yi - ybar) * d for yi in y)
    try:
        return sxy / sqrt(sxx * syy)
    except ZeroDivisionError:
@ -968,7 +972,7 @@ def linear_regression(x, y, /):
    xbar = fsum(x) / n
    ybar = fsum(y) / n
    sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
-    sxx = fsum((xi - xbar) ** 2.0 for xi in x)
+    sxx = fsum((d := xi - xbar) * d for xi in x)
    try:
        slope = sxy / sxx   # equivalent to:  covariance(x, y) / variance(x)
    except ZeroDivisionError:
@ -1094,10 +1098,11 @@ def samples(self, n, *, seed=None):
    def pdf(self, x):
        "Probability density function.  P(x <= X < x+dx) / dx"
-        variance = self._sigma ** 2.0
+        variance = self._sigma * self._sigma
        if not variance:
            raise StatisticsError('pdf() not defined when sigma is zero')
-        return exp((x - self._mu)**2.0 / (-2.0*variance)) / sqrt(tau*variance)
+        diff = x - self._mu
        return exp(diff * diff / (-2.0 * variance)) / sqrt(tau * variance)
    def cdf(self, x):
        "Cumulative distribution function.  P(X <= x)"
@ -1161,7 +1166,7 @@ def overlap(self, other):
        if not dv:
            return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0)))
        a = X._mu * Y_var - Y._mu * X_var
-        b = X._sigma * Y._sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
+        b = X._sigma * Y._sigma * sqrt(dm * dm + dv * log(Y_var / X_var))
        x1 = (a + b) / dv
        x2 = (a - b) / dv
        return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))
@ -1204,7 +1209,7 @@ def stdev(self):
    @property
    def variance(self):
        "Square of the standard deviation."
-        return self._sigma ** 2.0
+        return self._sigma * self._sigma
    def __add__(x1, x2):
        """Add a constant or another NormalDist instance.
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@ -1210,6 +1210,9 @@ def __pow__(self, other):
            def __add__(self, other):
                return type(self)(super().__add__(other))
            __radd__ = __add__
            def __mul__(self, other):
                return type(self)(super().__mul__(other))
            __rmul__ = __mul__
        return (float, Decimal, Fraction, MyFloat)
    def test_types_conserved(self):
--- a/Misc/NEWS.d/next/Library/2021-08-25-20-18-31.bpo-39218.BlO6jW.rst
+++ b/Misc/NEWS.d/next/Library/2021-08-25-20-18-31.bpo-39218.BlO6jW.rst
@ -0,0 +1 @@
 Improve accuracy of variance calculations by using x*x instead of x**2.
		`@ -0,0 +1 @@`
							`Improve accuracy of variance calculations by using xx instead of x*2.`