gh-114271: Fix race in Thread.join() (#114839)

There is a race between when `Thread._tstate_lock` is released[^1] in `Thread._wait_for_tstate_lock()`
and when `Thread._stop()` asserts[^2] that it is unlocked. Consider the following execution
involving threads A, B, and C:

1. A starts.
2. B joins A, blocking on its `_tstate_lock`.
3. C joins A, blocking on its `_tstate_lock`.
4. A finishes and releases its `_tstate_lock`.
5. B acquires A's `_tstate_lock` in `_wait_for_tstate_lock()`, releases it, but is swapped
   out before calling `_stop()`.
6. C is scheduled, acquires A's `_tstate_lock` in `_wait_for_tstate_lock()` but is swapped
   out before releasing it.
7. B is scheduled, calls `_stop()`, which asserts that A's `_tstate_lock` is not held.
   However, C holds it, so the assertion fails.

The race can be reproduced[^3] by inserting sleeps at the appropriate points in
the threading code. To do so, run the `repro_join_race.py` from the linked repo.

There are two main parts to this PR:

1. `_tstate_lock` is replaced with an event that is attached to `PyThreadState`.
   The event is set by the runtime prior to the thread being cleared (in the same
   place that `_tstate_lock` was released). `Thread.join()` blocks waiting for the
   event to be set.
2. `_PyInterpreterState_WaitForThreads()` provides the ability to wait for all
   non-daemon threads to exit. To do so, an `is_daemon` predicate was added to
   `PyThreadState`. This field is set each time a thread is created. `threading._shutdown()`
   now calls into `_PyInterpreterState_WaitForThreads()` instead of waiting on
   `_tstate_lock`s.

[^1]: 441affc9e7/Lib/threading.py (L1201)
[^2]: 441affc9e7/Lib/threading.py (L1115)
[^3]: 8194653279

---------

Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
Co-authored-by: Antoine Pitrou <antoine@python.org>
This commit is contained in:
mpage 2024-03-16 05:56:30 -07:00 committed by GitHub
parent 86bc40dd41
commit 33da0e844c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 771 additions and 643 deletions

View file

@ -408,7 +408,7 @@ def run(self):
def test_limbo_cleanup(self):
# Issue 7481: Failure to start thread should cleanup the limbo map.
def fail_new_thread(*args):
def fail_new_thread(*args, **kwargs):
raise threading.ThreadError()
_start_joinable_thread = threading._start_joinable_thread
threading._start_joinable_thread = fail_new_thread
@ -912,41 +912,6 @@ def f():
rc, out, err = assert_python_ok("-c", code)
self.assertEqual(err, b"")
def test_tstate_lock(self):
# Test an implementation detail of Thread objects.
started = _thread.allocate_lock()
finish = _thread.allocate_lock()
started.acquire()
finish.acquire()
def f():
started.release()
finish.acquire()
time.sleep(0.01)
# The tstate lock is None until the thread is started
t = threading.Thread(target=f)
self.assertIs(t._tstate_lock, None)
t.start()
started.acquire()
self.assertTrue(t.is_alive())
# The tstate lock can't be acquired when the thread is running
# (or suspended).
tstate_lock = t._tstate_lock
self.assertFalse(tstate_lock.acquire(timeout=0), False)
finish.release()
# When the thread ends, the state_lock can be successfully
# acquired.
self.assertTrue(tstate_lock.acquire(timeout=support.SHORT_TIMEOUT), False)
# But is_alive() is still True: we hold _tstate_lock now, which
# prevents is_alive() from knowing the thread's end-of-life C code
# is done.
self.assertTrue(t.is_alive())
# Let is_alive() find out the C code is done.
tstate_lock.release()
self.assertFalse(t.is_alive())
# And verify the thread disposed of _tstate_lock.
self.assertIsNone(t._tstate_lock)
t.join()
def test_repr_stopped(self):
# Verify that "stopped" shows up in repr(Thread) appropriately.
started = _thread.allocate_lock()
@ -1112,30 +1077,6 @@ def checker():
self.assertEqual(threading.getprofile(), old_profile)
self.assertEqual(sys.getprofile(), old_profile)
@cpython_only
def test_shutdown_locks(self):
for daemon in (False, True):
with self.subTest(daemon=daemon):
event = threading.Event()
thread = threading.Thread(target=event.wait, daemon=daemon)
# Thread.start() must add lock to _shutdown_locks,
# but only for non-daemon thread
thread.start()
tstate_lock = thread._tstate_lock
if not daemon:
self.assertIn(tstate_lock, threading._shutdown_locks)
else:
self.assertNotIn(tstate_lock, threading._shutdown_locks)
# unblock the thread and join it
event.set()
thread.join()
# Thread._stop() must remove tstate_lock from _shutdown_locks.
# Daemon threads must never add it to _shutdown_locks.
self.assertNotIn(tstate_lock, threading._shutdown_locks)
def test_locals_at_exit(self):
# bpo-19466: thread locals must not be deleted before destructors
# are called