GH-95861: Add support for Spearman's rank correlation coefficient (GH-95863)

This commit is contained in:
Raymond Hettinger 2022-08-18 13:48:27 -05:00 committed by GitHub
parent 91afe66707
commit 29c8f80760
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 117 additions and 18 deletions

View file

@ -104,7 +104,7 @@ These functions calculate statistics regarding relations between two inputs.
========================= =====================================================
:func:`covariance` Sample covariance for two variables.
:func:`correlation` Pearson's correlation coefficient for two variables.
:func:`correlation` Pearson and Spearman's correlation coefficients.
:func:`linear_regression` Slope and intercept for simple linear regression.
========================= =====================================================
@ -648,31 +648,57 @@ However, for reading convenience, most of the examples show sorted sequences.
.. versionadded:: 3.10
.. function:: correlation(x, y, /)
.. function:: correlation(x, y, /, *, method='linear')
Return the `Pearson's correlation coefficient
<https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
for two inputs. Pearson's correlation coefficient *r* takes values
between -1 and +1. It measures the strength and direction of the linear
relationship, where +1 means very strong, positive linear relationship,
-1 very strong, negative linear relationship, and 0 no linear relationship.
between -1 and +1. It measures the strength and direction of a linear
relationship.
If *method* is "ranked", computes `Spearman's rank correlation coefficient
<https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
for two inputs. The data is replaced by ranks. Ties are averaged so that
equal values receive the same rank. The resulting coefficient measures the
strength of a monotonic relationship.
Spearman's correlation coefficient is appropriate for ordinal data or for
continuous data that doesn't meet the linear proportion requirement for
Pearson's correlation coefficient.
Both inputs must be of the same length (no less than two), and need
not to be constant, otherwise :exc:`StatisticsError` is raised.
Examples:
Example with `Kepler's laws of planetary motion
<https://en.wikipedia.org/wiki/Kepler's_laws_of_planetary_motion>`_:
.. doctest::
>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
>>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
>>> correlation(x, x)
>>> # Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune
>>> orbital_period = [88, 225, 365, 687, 4331, 10_756, 30_687, 60_190] # days
>>> dist_from_sun = [58, 108, 150, 228, 778, 1_400, 2_900, 4_500] # million km
>>> # Show that a perfect monotonic relationship exists
>>> correlation(orbital_period, dist_from_sun, method='ranked')
1.0
>>> # Observe that a linear relationship is imperfect
>>> round(correlation(orbital_period, dist_from_sun), 4)
0.9882
>>> # Demonstrate Kepler's third law: There is a linear correlation
>>> # between the square of the orbital period and the cube of the
>>> # distance from the sun.
>>> period_squared = [p * p for p in orbital_period]
>>> dist_cubed = [d * d * d for d in dist_from_sun]
>>> round(correlation(period_squared, dist_cubed), 4)
1.0
>>> correlation(x, y)
-1.0
.. versionadded:: 3.10
.. versionchanged:: 3.12
Added support for Spearman's rank correlation coefficient.
.. function:: linear_regression(x, y, /, *, proportional=False)
Return the slope and intercept of `simple linear regression

View file

@ -134,11 +134,11 @@
from fractions import Fraction
from decimal import Decimal
from itertools import groupby, repeat
from itertools import count, groupby, repeat
from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
from functools import reduce
from operator import mul
from operator import mul, itemgetter
from collections import Counter, namedtuple, defaultdict
_SQRT2 = sqrt(2.0)
@ -355,6 +355,50 @@ def _fail_neg(values, errmsg='negative value'):
raise StatisticsError(errmsg)
yield x
def _rank(data, /, *, key=None, reverse=False, ties='average') -> list[float]:
"""Rank order a dataset. The lowest value has rank 1.
Ties are averaged so that equal values receive the same rank:
>>> data = [31, 56, 31, 25, 75, 18]
>>> _rank(data)
[3.5, 5.0, 3.5, 2.0, 6.0, 1.0]
The operation is idempotent:
>>> _rank([3.5, 5.0, 3.5, 2.0, 6.0, 1.0])
[3.5, 5.0, 3.5, 2.0, 6.0, 1.0]
It is possible to rank the data in reverse order so that
the highest value has rank 1. Also, a key-function can
extract the field to be ranked:
>>> goals = [('eagles', 45), ('bears', 48), ('lions', 44)]
>>> _rank(goals, key=itemgetter(1), reverse=True)
[2.0, 1.0, 3.0]
"""
# If this function becomes public at some point, more thought
# needs to be given to the signature. A list of ints is
# plausible when ties is "min" or "max". When ties is "average",
# either list[float] or list[Fraction] is plausible.
# Default handling of ties matches scipy.stats.mstats.spearmanr.
if ties != 'average':
raise ValueError(f'Unknown tie resolution method: {ties!r}')
if key is not None:
data = map(key, data)
val_pos = sorted(zip(data, count()), reverse=reverse)
i = 0 # To rank starting at 0 instead of 1, set i = -1.
result = [0] * len(val_pos)
for _, g in groupby(val_pos, key=itemgetter(0)):
group = list(g)
size = len(group)
rank = i + (size + 1) / 2
for value, orig_pos in group:
result[orig_pos] = rank
i += size
return result
def _integer_sqrt_of_frac_rto(n: int, m: int) -> int:
"""Square root of n/m, rounded to the nearest integer using round-to-odd."""
@ -988,14 +1032,12 @@ def covariance(x, y, /):
return sxy / (n - 1)
def correlation(x, y, /):
def correlation(x, y, /, *, method='linear'):
"""Pearson's correlation coefficient
Return the Pearson's correlation coefficient for two inputs. Pearson's
correlation coefficient *r* takes values between -1 and +1. It measures the
strength and direction of the linear relationship, where +1 means very
strong, positive linear relationship, -1 very strong, negative linear
relationship, and 0 no linear relationship.
correlation coefficient *r* takes values between -1 and +1. It measures
the strength and direction of a linear relationship.
>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
>>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
@ -1004,12 +1046,25 @@ def correlation(x, y, /):
>>> correlation(x, y)
-1.0
If *method* is "ranked", computes Spearman's rank correlation coefficient
for two inputs. The data is replaced by ranks. Ties are averaged
so that equal values receive the same rank. The resulting coefficient
measures the strength of a monotonic relationship.
Spearman's rank correlation coefficient is appropriate for ordinal
data or for continuous data that doesn't meet the linear proportion
requirement for Pearson's correlation coefficient.
"""
n = len(x)
if len(y) != n:
raise StatisticsError('correlation requires that both inputs have same number of data points')
if n < 2:
raise StatisticsError('correlation requires at least two data points')
if method not in {'linear', 'ranked'}:
raise ValueError(f'Unknown method: {method!r}')
if method == 'ranked':
x = _rank(x)
y = _rank(y)
xbar = fsum(x) / n
ybar = fsum(y) / n
sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))

View file

@ -2565,6 +2565,22 @@ def test_different_scales(self):
self.assertAlmostEqual(statistics.covariance(x, y), 0.1)
def test_correlation_spearman(self):
# https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide-2.php
# Compare with:
# >>> import scipy.stats.mstats
# >>> scipy.stats.mstats.spearmanr(reading, mathematics)
# SpearmanrResult(correlation=0.6686960980480712, pvalue=0.03450954165178532)
# And Wolfram Alpha gives: 0.668696
# https://www.wolframalpha.com/input?i=SpearmanRho%5B%7B56%2C+75%2C+45%2C+71%2C+61%2C+64%2C+58%2C+80%2C+76%2C+61%7D%2C+%7B66%2C+70%2C+40%2C+60%2C+65%2C+56%2C+59%2C+77%2C+67%2C+63%7D%5D
reading = [56, 75, 45, 71, 61, 64, 58, 80, 76, 61]
mathematics = [66, 70, 40, 60, 65, 56, 59, 77, 67, 63]
self.assertAlmostEqual(statistics.correlation(reading, mathematics, method='ranked'),
0.6686960980480712)
with self.assertRaises(ValueError):
statistics.correlation(reading, mathematics, method='bad_method')
class TestLinearRegression(unittest.TestCase):
def test_constant_input_error(self):

View file

@ -0,0 +1,2 @@
Add support for computing Spearman's correlation coefficient to the existing
statistics.correlation() function.