mirror of
https://github.com/python/cpython.git
synced 2025-12-31 04:23:37 +00:00
Add statistics recipe for sampling from an estimated probability density distribution (#117221)
This commit is contained in:
parent
b3e8c78ed7
commit
92397d5ead
1 changed files with 58 additions and 0 deletions
|
|
@ -1148,6 +1148,64 @@ The final prediction goes to the largest posterior. This is known as the
|
|||
'female'
|
||||
|
||||
|
||||
Sampling from kernel density estimation
|
||||
***************************************
|
||||
|
||||
The :func:`kde()` function creates a continuous probability density
|
||||
function from discrete samples. Some applications need a way to make
|
||||
random selections from that distribution.
|
||||
|
||||
The technique is to pick a sample from a bandwidth scaled kernel
|
||||
function and recenter the result around a randomly chosen point from
|
||||
the input data. This can be done with any kernel that has a known or
|
||||
accurately approximated inverse cumulative distribution function.
|
||||
|
||||
.. testcode::
|
||||
|
||||
from random import choice, random, seed
|
||||
from math import sqrt, log, pi, tan, asin
|
||||
from statistics import NormalDist
|
||||
|
||||
kernel_invcdfs = {
|
||||
'normal': NormalDist().inv_cdf,
|
||||
'logistic': lambda p: log(p / (1 - p)),
|
||||
'sigmoid': lambda p: log(tan(p * pi/2)),
|
||||
'rectangular': lambda p: 2*p - 1,
|
||||
'triangular': lambda p: sqrt(2*p) - 1 if p < 0.5 else 1 - sqrt(2 - 2*p),
|
||||
'cosine': lambda p: 2*asin(2*p - 1)/pi,
|
||||
}
|
||||
|
||||
def kde_random(data, h, kernel='normal'):
|
||||
'Return a function that samples from kde() smoothed data.'
|
||||
kernel_invcdf = kernel_invcdfs[kernel]
|
||||
def rand():
|
||||
return h * kernel_invcdf(random()) + choice(data)
|
||||
return rand
|
||||
|
||||
For example:
|
||||
|
||||
.. doctest::
|
||||
|
||||
>>> discrete_samples = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
|
||||
>>> rand = kde_random(discrete_samples, h=1.5)
|
||||
>>> seed(8675309)
|
||||
>>> selections = [rand() for i in range(10)]
|
||||
>>> [round(x, 1) for x in selections]
|
||||
[4.7, 7.4, 1.2, 7.8, 6.9, -1.3, 5.8, 0.2, -1.4, 5.7]
|
||||
|
||||
.. testcode::
|
||||
:hide:
|
||||
|
||||
from statistics import kde
|
||||
from math import isclose
|
||||
|
||||
# Verify that cdf / invcdf will round trip
|
||||
xarr = [i/100 for i in range(-100, 101)]
|
||||
for kernel, invcdf in kernel_invcdfs.items():
|
||||
cdf = kde([0.0], h=1.0, kernel=kernel, cumulative=True)
|
||||
for x in xarr:
|
||||
assert isclose(invcdf(cdf(x)), x, abs_tol=1E-9)
|
||||
|
||||
..
|
||||
# This modelines must appear within the last ten lines of the file.
|
||||
kate: indent-width 3; remove-trailing-space on; replace-tabs on; encoding utf-8;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue