diff --git a/Doc/library/exceptions.rst b/Doc/library/exceptions.rst index 16d42c010f6..b5e3a84b455 100644 --- a/Doc/library/exceptions.rst +++ b/Doc/library/exceptions.rst @@ -978,6 +978,12 @@ their subgroups based on the types of the contained exceptions. raises a :exc:`TypeError` if any contained exception is not an :exc:`Exception` subclass. + .. impl-detail:: + + The ``excs`` parameter may be any sequence, but lists and tuples are + specifically processed more efficiently here. For optimal performance, + pass a tuple as ``excs``. + .. attribute:: message The ``msg`` argument to the constructor. This is a read-only attribute. diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index ffba99248cd..8a832f304b3 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -68,7 +68,7 @@ Summary -- Release highlights * :pep:`810`: :ref:`Explicit lazy imports for faster startup times ` * :pep:`799`: :ref:`A dedicated profiling package for organizing Python - profiling tools ` + profiling tools ` * :pep:`686`: :ref:`Python now uses UTF-8 as the default encoding ` * :pep:`782`: :ref:`A new PyBytesWriter C API to create a Python bytes object @@ -170,14 +170,32 @@ imports cannot be lazy either (``lazy from __future__ import ...`` raises .. seealso:: :pep:`810` for the full specification and rationale. (Contributed by Pablo Galindo Salgado and Dino Viehland in :gh:`142349`.) +.. _whatsnew315-profiling-package: + +:pep:`799`: A dedicated profiling package +----------------------------------------- + +A new :mod:`!profiling` module has been added to organize Python's built-in +profiling tools under a single, coherent namespace. This module contains: + +* :mod:`!profiling.tracing`: deterministic function-call tracing (relocated from + :mod:`cProfile`). +* :mod:`!profiling.sampling`: a new statistical sampling profiler (named Tachyon). + +The :mod:`cProfile` module remains as an alias for backwards compatibility. +The :mod:`profile` module is deprecated and will be removed in Python 3.17. + +.. seealso:: :pep:`799` for further details. + +(Contributed by Pablo Galindo and László Kiss Kollár in :gh:`138122`.) .. _whatsnew315-sampling-profiler: -:pep:`799`: High frequency statistical sampling profiler --------------------------------------------------------- +Tachyon: High frequency statistical sampling profiler +----------------------------------------------------- -A new statistical sampling profiler has been added to the new :mod:`!profiling` module as +A new statistical sampling profiler (Tachyon) has been added as :mod:`!profiling.sampling`. This profiler enables low-overhead performance analysis of running Python processes without requiring code modification or process restart. @@ -186,101 +204,64 @@ every function call, the sampling profiler periodically captures stack traces fr running processes. This approach provides virtually zero overhead while achieving sampling rates of **up to 1,000,000 Hz**, making it the fastest sampling profiler available for Python (at the time of its contribution) and ideal for debugging -performance issues in production environments. +performance issues in production environments. This capability is particularly +valuable for debugging performance issues in production systems where traditional +profiling approaches would be too intrusive. 
Key features include: * **Zero-overhead profiling**: Attach to any running Python process without - affecting its performance -* **No code modification required**: Profile existing applications without restart -* **Real-time statistics**: Monitor sampling quality during data collection -* **Multiple output formats**: Generate both detailed statistics and flamegraph data -* **Thread-aware profiling**: Option to profile all threads or just the main thread + affecting its performance. Ideal for production debugging where you can't afford + to restart or slow down your application. -Profile process 1234 for 10 seconds with default settings: +* **No code modification required**: Profile existing applications without restart. + Simply point the profiler at a running process by PID and start collecting data. -.. code-block:: shell +* **Flexible target modes**: - python -m profiling.sampling 1234 + * Profile running processes by PID (``attach``) - attach to already-running applications + * Run and profile scripts directly (``run``) - profile from the very start of execution + * Execute and profile modules (``run -m``) - profile packages run as ``python -m module`` -Profile with custom interval and duration, save to file: +* **Multiple profiling modes**: Choose what to measure based on your performance investigation: -.. code-block:: shell + * **Wall-clock time** (``--mode wall``, default): Measures real elapsed time including I/O, + network waits, and blocking operations. Use this to understand where your program spends + calendar time, including when waiting for external resources. + * **CPU time** (``--mode cpu``): Measures only active CPU execution time, excluding I/O waits + and blocking. Use this to identify CPU-bound bottlenecks and optimize computational work. + * **GIL-holding time** (``--mode gil``): Measures time spent holding Python's Global Interpreter + Lock. Use this to identify which threads dominate GIL usage in multi-threaded applications. - python -m profiling.sampling -i 50 -d 30 -o profile.stats 1234 +* **Thread-aware profiling**: Option to profile all threads (``-a``) or just the main thread, + essential for understanding multi-threaded application behavior. -Generate collapsed stacks for flamegraph: +* **Multiple output formats**: Choose the visualization that best fits your workflow: -.. code-block:: shell + * ``--pstats``: Detailed tabular statistics compatible with :mod:`pstats`. Shows function-level + timing with direct and cumulative samples. Best for detailed analysis and integration with + existing Python profiling tools. + * ``--collapsed``: Generates collapsed stack traces (one line per stack). This format is + specifically designed for creating flamegraphs with external tools like Brendan Gregg's + FlameGraph scripts or speedscope. + * ``--flamegraph``: Generates a self-contained interactive HTML flamegraph using D3.js. + Opens directly in your browser for immediate visual analysis. Flamegraphs show the call + hierarchy where width represents time spent, making it easy to spot bottlenecks at a glance. + * ``--gecko``: Generates Gecko Profiler format compatible with Firefox Profiler + (https://profiler.firefox.com). Upload the output to Firefox Profiler for advanced + timeline-based analysis with features like stack charts, markers, and network activity. + * ``--heatmap``: Generates an interactive HTML heatmap visualization with line-level sample + counts. Creates a directory with per-file heatmaps showing exactly where time is spent + at the source code level. 
- python -m profiling.sampling --collapsed 1234 +* **Live interactive mode**: Real-time TUI profiler with a top-like interface (``--live``). + Monitor performance as your application runs with interactive sorting and filtering. -Profile all threads and sort by total time: +* **Async-aware profiling**: Profile async/await code with task-based stack reconstruction + (``--async-aware``). See which coroutines are consuming time, with options to show only + running tasks or all tasks including those waiting. -.. code-block:: shell - - python -m profiling.sampling -a --sort-tottime 1234 - -The profiler generates statistical estimates of where time is spent: - -.. code-block:: text - - Real-time sampling stats: Mean: 100261.5Hz (9.97µs) Min: 86333.4Hz (11.58µs) Max: 118807.2Hz (8.42µs) Samples: 400001 - Captured 498841 samples in 5.00 seconds - Sample rate: 99768.04 samples/sec - Error rate: 0.72% - Profile Stats: - nsamples sample% tottime (s) cumul% cumtime (s) filename:lineno(function) - 43/418858 0.0 0.000 87.9 4.189 case.py:667(TestCase.run) - 3293/418812 0.7 0.033 87.9 4.188 case.py:613(TestCase._callTestMethod) - 158562/158562 33.3 1.586 33.3 1.586 test_compile.py:725(TestSpecifics.test_compiler_recursion_limit..check_limit) - 129553/129553 27.2 1.296 27.2 1.296 ast.py:46(parse) - 0/128129 0.0 0.000 26.9 1.281 test_ast.py:884(AST_Tests.test_ast_recursion_limit..check_limit) - 7/67446 0.0 0.000 14.2 0.674 test_compile.py:729(TestSpecifics.test_compiler_recursion_limit) - 6/60380 0.0 0.000 12.7 0.604 test_ast.py:888(AST_Tests.test_ast_recursion_limit) - 3/50020 0.0 0.000 10.5 0.500 test_compile.py:727(TestSpecifics.test_compiler_recursion_limit) - 1/38011 0.0 0.000 8.0 0.380 test_ast.py:886(AST_Tests.test_ast_recursion_limit) - 1/25076 0.0 0.000 5.3 0.251 test_compile.py:728(TestSpecifics.test_compiler_recursion_limit) - 22361/22362 4.7 0.224 4.7 0.224 test_compile.py:1368(TestSpecifics.test_big_dict_literal) - 4/18008 0.0 0.000 3.8 0.180 test_ast.py:889(AST_Tests.test_ast_recursion_limit) - 11/17696 0.0 0.000 3.7 0.177 subprocess.py:1038(Popen.__init__) - 16968/16968 3.6 0.170 3.6 0.170 subprocess.py:1900(Popen._execute_child) - 2/16941 0.0 0.000 3.6 0.169 test_compile.py:730(TestSpecifics.test_compiler_recursion_limit) - - Legend: - nsamples: Direct/Cumulative samples (direct executing / on call stack) - sample%: Percentage of total samples this function was directly executing - tottime: Estimated total time spent directly in this function - cumul%: Percentage of total samples when this function was on the call stack - cumtime: Estimated cumulative time (including time in called functions) - filename:lineno(function): Function location and name - - Summary of Interesting Functions: - - Functions with Highest Direct/Cumulative Ratio (Hot Spots): - 1.000 direct/cumulative ratio, 33.3% direct samples: test_compile.py:(TestSpecifics.test_compiler_recursion_limit..check_limit) - 1.000 direct/cumulative ratio, 27.2% direct samples: ast.py:(parse) - 1.000 direct/cumulative ratio, 3.6% direct samples: subprocess.py:(Popen._execute_child) - - Functions with Highest Call Frequency (Indirect Calls): - 418815 indirect calls, 87.9% total stack presence: case.py:(TestCase.run) - 415519 indirect calls, 87.9% total stack presence: case.py:(TestCase._callTestMethod) - 159470 indirect calls, 33.5% total stack presence: test_compile.py:(TestSpecifics.test_compiler_recursion_limit) - - Functions with Highest Call Magnification (Cumulative/Direct): - 12267.9x call magnification, 159470 indirect calls from 13 
direct: test_compile.py:(TestSpecifics.test_compiler_recursion_limit) - 10581.7x call magnification, 116388 indirect calls from 11 direct: test_ast.py:(AST_Tests.test_ast_recursion_limit) - 9740.9x call magnification, 418815 indirect calls from 43 direct: case.py:(TestCase.run) - -The profiler automatically identifies performance bottlenecks through statistical -analysis, highlighting functions with high CPU usage and call frequency patterns. - -This capability is particularly valuable for debugging performance issues in -production systems where traditional profiling approaches would be too intrusive. - - .. seealso:: :pep:`799` for further details. - -(Contributed by Pablo Galindo and László Kiss Kollár in :gh:`135953`.) +(Contributed by Pablo Galindo and László Kiss Kollár in :gh:`135953` and :gh:`138122`.) .. _whatsnew315-improved-error-messages: diff --git a/Include/cpython/pyerrors.h b/Include/cpython/pyerrors.h index 6b63d304b0d..be2e3b641c2 100644 --- a/Include/cpython/pyerrors.h +++ b/Include/cpython/pyerrors.h @@ -18,6 +18,7 @@ typedef struct { PyException_HEAD PyObject *msg; PyObject *excs; + PyObject *excs_str; } PyBaseExceptionGroupObject; typedef struct { diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h index 1e1e46ea4c0..22df26bd37a 100644 --- a/Include/cpython/pystate.h +++ b/Include/cpython/pystate.h @@ -135,6 +135,15 @@ struct _ts { /* Pointer to currently executing frame. */ struct _PyInterpreterFrame *current_frame; + /* Pointer to the base frame (bottommost sentinel frame). + Used by profilers to validate complete stack unwinding. + Points to the embedded base_frame in _PyThreadStateImpl. + The frame is embedded there rather than here because _PyInterpreterFrame + is defined in internal headers that cannot be exposed in the public API. 
*/ + struct _PyInterpreterFrame *base_frame; + + struct _PyInterpreterFrame *last_profiled_frame; + Py_tracefunc c_profilefunc; Py_tracefunc c_tracefunc; PyObject *c_profileobj; diff --git a/Include/internal/pycore_debug_offsets.h b/Include/internal/pycore_debug_offsets.h index 0f17bf17f82..1cdc4449b17 100644 --- a/Include/internal/pycore_debug_offsets.h +++ b/Include/internal/pycore_debug_offsets.h @@ -102,6 +102,8 @@ typedef struct _Py_DebugOffsets { uint64_t next; uint64_t interp; uint64_t current_frame; + uint64_t base_frame; + uint64_t last_profiled_frame; uint64_t thread_id; uint64_t native_thread_id; uint64_t datastack_chunk; @@ -272,6 +274,8 @@ typedef struct _Py_DebugOffsets { .next = offsetof(PyThreadState, next), \ .interp = offsetof(PyThreadState, interp), \ .current_frame = offsetof(PyThreadState, current_frame), \ + .base_frame = offsetof(PyThreadState, base_frame), \ + .last_profiled_frame = offsetof(PyThreadState, last_profiled_frame), \ .thread_id = offsetof(PyThreadState, thread_id), \ .native_thread_id = offsetof(PyThreadState, native_thread_id), \ .datastack_chunk = offsetof(PyThreadState, datastack_chunk), \ diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h index 0403739c015..7e77a21ecb8 100644 --- a/Include/internal/pycore_global_objects_fini_generated.h +++ b/Include/internal/pycore_global_objects_fini_generated.h @@ -1611,6 +1611,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_parameter_type)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_return)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_stack)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cache_frames)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cached_datetime_module)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cached_statements)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cadata)); @@ -2055,6 +2056,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stacklevel)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(start)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(statement)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stats)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(status)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stderr)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stdin)); diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h index ce94e69b28c..748502ab740 100644 --- a/Include/internal/pycore_global_strings.h +++ b/Include/internal/pycore_global_strings.h @@ -334,6 +334,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(c_parameter_type) STRUCT_FOR_ID(c_return) STRUCT_FOR_ID(c_stack) + STRUCT_FOR_ID(cache_frames) STRUCT_FOR_ID(cached_datetime_module) STRUCT_FOR_ID(cached_statements) STRUCT_FOR_ID(cadata) @@ -778,6 +779,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(stacklevel) STRUCT_FOR_ID(start) STRUCT_FOR_ID(statement) + STRUCT_FOR_ID(stats) STRUCT_FOR_ID(status) STRUCT_FOR_ID(stderr) STRUCT_FOR_ID(stdin) diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h index af2795debae..b10d6de2963 100644 --- a/Include/internal/pycore_runtime_init_generated.h +++ b/Include/internal/pycore_runtime_init_generated.h @@ -1609,6 +1609,7 @@ extern "C" { INIT_ID(c_parameter_type), \ INIT_ID(c_return), \ INIT_ID(c_stack), \ + INIT_ID(cache_frames), \ 
INIT_ID(cached_datetime_module), \ INIT_ID(cached_statements), \ INIT_ID(cadata), \ @@ -2053,6 +2054,7 @@ extern "C" { INIT_ID(stacklevel), \ INIT_ID(start), \ INIT_ID(statement), \ + INIT_ID(stats), \ INIT_ID(status), \ INIT_ID(stderr), \ INIT_ID(stdin), \ diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index 50048801b2e..c4f723ac8ab 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -10,6 +10,7 @@ extern "C" { #include "pycore_brc.h" // struct _brc_thread_state #include "pycore_freelist_state.h" // struct _Py_freelists +#include "pycore_interpframe_structs.h" // _PyInterpreterFrame #include "pycore_mimalloc.h" // struct _mimalloc_thread_state #include "pycore_qsbr.h" // struct qsbr #include "pycore_uop.h" // struct _PyUOpInstruction @@ -61,6 +62,10 @@ typedef struct _PyThreadStateImpl { // semi-public fields are in PyThreadState. PyThreadState base; + // Embedded base frame - sentinel at the bottom of the frame stack. + // Used by profiling/sampling to detect incomplete stack traces. + _PyInterpreterFrame base_frame; + // The reference count field is used to synchronize deallocation of the // thread state during runtime finalization. Py_ssize_t refcount; diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h index 3e6ec8801b0..f61ad458e8c 100644 --- a/Include/internal/pycore_unicodeobject_generated.h +++ b/Include/internal/pycore_unicodeobject_generated.h @@ -1116,6 +1116,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(cache_frames); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(cached_datetime_module); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -2892,6 +2896,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(stats); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(status); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); diff --git a/InternalDocs/frames.md b/InternalDocs/frames.md index 804d7436018..60ab2055afa 100644 --- a/InternalDocs/frames.md +++ b/InternalDocs/frames.md @@ -111,6 +111,55 @@ ### Shim frames instruction which cleans up the shim frame and returns. +### Base frame + +Each thread state contains an embedded `_PyInterpreterFrame` called the "base frame" +that serves as a sentinel at the bottom of the frame stack. This frame is allocated +in `_PyThreadStateImpl` (the internal extension of `PyThreadState`) and initialized +when the thread state is created. The `owner` field is set to `FRAME_OWNED_BY_INTERPRETER`. + +External profilers and sampling tools can validate that they have successfully unwound +the complete call stack by checking that the frame chain terminates at the base frame. +The `PyThreadState.base_frame` pointer provides the expected address to compare against. +If a stack walk doesn't reach this frame, the sample is incomplete (possibly due to a +race condition) and should be discarded. 
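+
+For illustration only, the check might look like the sketch below. Here
+`read_ptr` is a hypothetical stand-in for whatever remote memory reader the
+profiler uses, and `previous_offset` would be taken from the debug offsets
+table rather than hard-coded:
+
+```python
+def walk_stack(read_ptr, current_frame_addr, base_frame_addr, previous_offset):
+    """Collect frame addresses; report whether the walk reached the base frame."""
+    frames = []
+    addr = current_frame_addr
+    while addr:
+        frames.append(addr)
+        if addr == base_frame_addr:
+            return frames, True                  # complete stack
+        addr = read_ptr(addr + previous_offset)  # follow frame->previous
+    return frames, False                         # incomplete: discard this sample
+```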
+
+The base frame is embedded in `_PyThreadStateImpl` rather than `PyThreadState` because
+`_PyInterpreterFrame` is defined in internal headers that cannot be exposed in the
+public API. A pointer (`PyThreadState.base_frame`) is provided for profilers to access
+the address without needing internal headers.
+
+See the initialization in `new_threadstate()` in [Python/pystate.c](../Python/pystate.c).
+
+#### How profilers should use the base frame
+
+External profilers should read `tstate->base_frame` before walking the stack, then
+walk from `tstate->current_frame` following `frame->previous` pointers until reaching
+a frame with `owner == FRAME_OWNED_BY_INTERPRETER`. After the walk, verify that the
+last frame address matches `base_frame`. If not, discard the sample as incomplete,
+since the frame chain may have been in an inconsistent state due to concurrent updates.
+
+
+### Remote Profiling Frame Cache
+
+The `last_profiled_frame` field in `PyThreadState` supports an optimization for
+remote profilers that sample call stacks from external processes. When a remote
+profiler reads the call stack, it writes the current frame address to this field.
+The eval loop then keeps this pointer valid by updating it to the parent frame
+whenever a frame returns (in `_PyEval_FrameClearAndPop`).
+
+This creates a "high-water mark" that always points to a frame still on the stack.
+On subsequent samples, the profiler can walk from `current_frame` until it reaches
+`last_profiled_frame`, knowing that frames from that point downward are unchanged
+and can be retrieved from a cache. This significantly reduces the number of remote
+memory reads needed when call stacks are deep and stable at their base.
+
+The update in `_PyEval_FrameClearAndPop` is guarded: it only writes when
+`last_profiled_frame` is non-NULL AND matches the frame being popped. This
+prevents transient frames (called and returned between profiler samples) from
+corrupting the cache pointer, while avoiding any overhead when profiling is inactive.
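+
+The interaction is easier to see in code. The sketch below is illustrative only:
+`read_ptr`, `read_frame` and `write_ptr` are hypothetical helpers standing in for
+the profiler's remote memory access, `off` is a mapping of the relevant structure
+offsets (in practice taken from `_Py_DebugOffsets`), and `cache` maps a frame
+address to the frames previously decoded from that point down:
+
+```python
+def sample_stack(read_ptr, read_frame, write_ptr, tstate_addr, off, cache):
+    """Take one sample, reusing cached frames below the high-water mark."""
+    top = read_ptr(tstate_addr + off["current_frame"])
+    cutoff = read_ptr(tstate_addr + off["last_profiled_frame"])
+    stack, addr = [], top
+    while addr:
+        if addr == cutoff and addr in cache:
+            stack.extend(cache[addr])     # frames from here down are unchanged
+            break
+        stack.append(read_frame(addr))    # new frame: decode it from remote memory
+        addr = read_ptr(addr + off["frame_previous"])
+    # (A real profiler would also validate the walk against tstate->base_frame.)
+    cache[top] = list(stack)              # reusable while top stays on the stack
+    write_ptr(tstate_addr + off["last_profiled_frame"], top)  # new high-water mark
+    return stack
+```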
+ + ### The Instruction Pointer `_PyInterpreterFrame` has two fields which are used to maintain the instruction diff --git a/Lib/argparse.py b/Lib/argparse.py index 07d7d77e884..398825508f5 100644 --- a/Lib/argparse.py +++ b/Lib/argparse.py @@ -189,6 +189,8 @@ def __init__( self._whitespace_matcher = _re.compile(r'\s+', _re.ASCII) self._long_break_matcher = _re.compile(r'\n\n\n+') + self._set_color(False) + def _set_color(self, color): from _colorize import can_colorize, decolor, get_theme @@ -334,31 +336,15 @@ def _format_usage(self, usage, actions, groups, prefix): elif usage is None: prog = '%(prog)s' % dict(prog=self._prog) - # split optionals from positionals - optionals = [] - positionals = [] - for action in actions: - if action.option_strings: - optionals.append(action) - else: - positionals.append(action) - + parts, pos_start = self._get_actions_usage_parts(actions, groups) # build full usage string - format = self._format_actions_usage - action_usage = format(optionals + positionals, groups) - usage = ' '.join([s for s in [prog, action_usage] if s]) + usage = ' '.join(filter(None, [prog, *parts])) # wrap the usage parts if it's too long text_width = self._width - self._current_indent if len(prefix) + len(self._decolor(usage)) > text_width: # break usage into wrappable parts - # keep optionals and positionals together to preserve - # mutually exclusive group formatting (gh-75949) - all_actions = optionals + positionals - parts, pos_start = self._get_actions_usage_parts_with_split( - all_actions, groups, len(optionals) - ) opt_parts = parts[:pos_start] pos_parts = parts[pos_start:] @@ -417,129 +403,114 @@ def get_lines(parts, indent, prefix=None): # prefix with 'usage:' return f'{t.usage}{prefix}{t.reset}{usage}\n\n' - def _format_actions_usage(self, actions, groups): - return ' '.join(self._get_actions_usage_parts(actions, groups)) - def _is_long_option(self, string): return len(string) > 2 def _get_actions_usage_parts(self, actions, groups): - parts, _ = self._get_actions_usage_parts_with_split(actions, groups) - return parts - - def _get_actions_usage_parts_with_split(self, actions, groups, opt_count=None): """Get usage parts with split index for optionals/positionals. Returns (parts, pos_start) where pos_start is the index in parts - where positionals begin. When opt_count is None, pos_start is None. + where positionals begin. This preserves mutually exclusive group formatting across the optionals/positionals boundary (gh-75949). 
""" - # find group indices and identify actions in groups - group_actions = set() - inserts = {} + actions = [action for action in actions if action.help is not SUPPRESS] + # group actions by mutually exclusive groups + action_groups = dict.fromkeys(actions) for group in groups: - if not group._group_actions: - raise ValueError(f'empty group {group}') - - if all(action.help is SUPPRESS for action in group._group_actions): - continue - - try: - start = min(actions.index(item) for item in group._group_actions) - except ValueError: - continue - else: - end = start + len(group._group_actions) - if set(actions[start:end]) == set(group._group_actions): - group_actions.update(group._group_actions) - inserts[start, end] = group + for action in group._group_actions: + if action in action_groups: + action_groups[action] = group + # positional arguments keep their position + positionals = [] + for action in actions: + if not action.option_strings: + group = action_groups.pop(action) + if group: + group_actions = [ + action2 for action2 in group._group_actions + if action2.option_strings and + action_groups.pop(action2, None) + ] + [action] + positionals.append((group.required, group_actions)) + else: + positionals.append((None, [action])) + # the remaining optional arguments are sorted by the position of + # the first option in the group + optionals = [] + for action in actions: + if action.option_strings and action in action_groups: + group = action_groups.pop(action) + if group: + group_actions = [action] + [ + action2 for action2 in group._group_actions + if action2.option_strings and + action_groups.pop(action2, None) + ] + optionals.append((group.required, group_actions)) + else: + optionals.append((None, [action])) # collect all actions format strings parts = [] t = self._theme - for action in actions: + pos_start = None + for i, (required, group) in enumerate(optionals + positionals): + start = len(parts) + if i == len(optionals): + pos_start = start + in_group = len(group) > 1 + for action in group: + # produce all arg strings + if not action.option_strings: + default = self._get_default_metavar_for_positional(action) + part = self._format_args(action, default) + # if it's in a group, strip the outer [] + if in_group: + if part[0] == '[' and part[-1] == ']': + part = part[1:-1] + part = t.summary_action + part + t.reset - # suppressed arguments are marked with None - if action.help is SUPPRESS: - part = None - - # produce all arg strings - elif not action.option_strings: - default = self._get_default_metavar_for_positional(action) - part = ( - t.summary_action - + self._format_args(action, default) - + t.reset - ) - - # if it's in a group, strip the outer [] - if action in group_actions: - if part[0] == '[' and part[-1] == ']': - part = part[1:-1] - - # produce the first way to invoke the option in brackets - else: - option_string = action.option_strings[0] - if self._is_long_option(option_string): - option_color = t.summary_long_option + # produce the first way to invoke the option in brackets else: - option_color = t.summary_short_option + option_string = action.option_strings[0] + if self._is_long_option(option_string): + option_color = t.summary_long_option + else: + option_color = t.summary_short_option - # if the Optional doesn't take a value, format is: - # -s or --long - if action.nargs == 0: - part = action.format_usage() - part = f"{option_color}{part}{t.reset}" + # if the Optional doesn't take a value, format is: + # -s or --long + if action.nargs == 0: + part = 
action.format_usage() + part = f"{option_color}{part}{t.reset}" - # if the Optional takes a value, format is: - # -s ARGS or --long ARGS - else: - default = self._get_default_metavar_for_optional(action) - args_string = self._format_args(action, default) - part = ( - f"{option_color}{option_string} " - f"{t.summary_label}{args_string}{t.reset}" - ) + # if the Optional takes a value, format is: + # -s ARGS or --long ARGS + else: + default = self._get_default_metavar_for_optional(action) + args_string = self._format_args(action, default) + part = ( + f"{option_color}{option_string} " + f"{t.summary_label}{args_string}{t.reset}" + ) - # make it look optional if it's not required or in a group - if not action.required and action not in group_actions: - part = '[%s]' % part + # make it look optional if it's not required or in a group + if not (action.required or required or in_group): + part = '[%s]' % part - # add the action string to the list - parts.append(part) + # add the action string to the list + parts.append(part) - # group mutually exclusive actions - inserted_separators_indices = set() - for start, end in sorted(inserts, reverse=True): - group = inserts[start, end] - group_parts = [item for item in parts[start:end] if item is not None] - group_size = len(group_parts) - if group.required: - open, close = "()" if group_size > 1 else ("", "") - else: - open, close = "[]" - group_parts[0] = open + group_parts[0] - group_parts[-1] = group_parts[-1] + close - for i, part in enumerate(group_parts[:-1], start=start): - # insert a separator if not already done in a nested group - if i not in inserted_separators_indices: - parts[i] = part + ' |' - inserted_separators_indices.add(i) - parts[start + group_size - 1] = group_parts[-1] - for i in range(start + group_size, end): - parts[i] = None + if in_group: + parts[start] = ('(' if required else '[') + parts[start] + for i in range(start, len(parts) - 1): + parts[i] += ' |' + parts[-1] += ')' if required else ']' - # if opt_count is provided, calculate where positionals start in - # the final parts list (for wrapping onto separate lines). - # Count before filtering None entries since indices shift after. 
- if opt_count is not None: - pos_start = sum(1 for p in parts[:opt_count] if p is not None) - else: - pos_start = None - - # return the usage parts and split point (gh-75949) - return [item for item in parts if item is not None], pos_start + if pos_start is None: + pos_start = len(parts) + return parts, pos_start def _format_text(self, text): if '%(prog)' in text: @@ -2008,14 +1979,16 @@ def add_subparsers(self, **kwargs): self._subparsers = self._positionals # prog defaults to the usage message of this parser, skipping - # optional arguments and with no "usage:" prefix + # non-required optional arguments and with no "usage:" prefix if kwargs.get('prog') is None: # Create formatter without color to avoid storing ANSI codes in prog formatter = self.formatter_class(prog=self.prog) formatter._set_color(False) positionals = self._get_positional_actions() + required_optionals = [action for action in self._get_optional_actions() + if action.required] groups = self._mutually_exclusive_groups - formatter.add_usage(None, positionals, groups, '') + formatter.add_usage(None, required_optionals + positionals, groups, '') kwargs['prog'] = formatter.format_help().strip() # create the parsers action and add it to the positionals list diff --git a/Lib/asyncio/futures.py b/Lib/asyncio/futures.py index 6bd00a64478..29652295218 100644 --- a/Lib/asyncio/futures.py +++ b/Lib/asyncio/futures.py @@ -389,7 +389,7 @@ def _set_state(future, other): def _call_check_cancel(destination): if destination.cancelled(): - if source_loop is None or source_loop is dest_loop: + if source_loop is None or source_loop is events._get_running_loop(): source.cancel() else: source_loop.call_soon_threadsafe(source.cancel) @@ -398,7 +398,7 @@ def _call_set_state(source): if (destination.cancelled() and dest_loop is not None and dest_loop.is_closed()): return - if dest_loop is None or dest_loop is source_loop: + if dest_loop is None or dest_loop is events._get_running_loop(): _set_state(destination, source) else: if dest_loop.is_closed(): diff --git a/Lib/doctest.py b/Lib/doctest.py index ad8fb900f69..0fcfa1e3e97 100644 --- a/Lib/doctest.py +++ b/Lib/doctest.py @@ -1167,6 +1167,32 @@ def _find_lineno(self, obj, source_lines): if pat.match(source_lines[lineno]): return lineno + # Handle __test__ string doctests formatted as triple-quoted + # strings. Find a non-blank line in the test string and match it + # in the source, verifying subsequent lines also match to handle + # duplicate lines. + if isinstance(obj, str) and source_lines is not None: + obj_lines = obj.splitlines(keepends=True) + # Skip the first line (may be on same line as opening quotes) + # and any blank lines to find a meaningful line to match. + start_index = 1 + while (start_index < len(obj_lines) + and not obj_lines[start_index].strip()): + start_index += 1 + if start_index < len(obj_lines): + target_line = obj_lines[start_index] + for lineno, source_line in enumerate(source_lines): + if source_line == target_line: + # Verify subsequent lines also match + for i in range(start_index + 1, len(obj_lines) - 1): + source_idx = lineno + i - start_index + if source_idx >= len(source_lines): + break + if obj_lines[i] != source_lines[source_idx]: + break + else: + return lineno - start_index + # We couldn't find the line number. 
return None diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index c7f665b3990..cbff9694742 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -2792,6 +2792,9 @@ def _steal_trailing_WSP_if_exists(lines): if lines and lines[-1] and lines[-1][-1] in WSP: wsp = lines[-1][-1] lines[-1] = lines[-1][:-1] + # gh-142006: if the line is now empty, remove it entirely. + if not lines[-1]: + lines.pop() return wsp def _refold_parse_tree(parse_tree, *, policy): diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py index 6479b9bab7a..ae8ef32792b 100644 --- a/Lib/email/feedparser.py +++ b/Lib/email/feedparser.py @@ -504,10 +504,9 @@ def _parse_headers(self, lines): self._input.unreadline(line) return else: - # Weirdly placed unix-from line. Note this as a defect - # and ignore it. + # Weirdly placed unix-from line. defect = errors.MisplacedEnvelopeHeaderDefect(line) - self._cur.defects.append(defect) + self.policy.handle_defect(self._cur, defect) continue # Split the line on the colon separating field name from value. # There will always be a colon, because if there wasn't the part of @@ -519,7 +518,7 @@ def _parse_headers(self, lines): # message. Track the error but keep going. if i == 0: defect = errors.InvalidHeaderDefect("Missing header name.") - self._cur.defects.append(defect) + self.policy.handle_defect(self._cur, defect) continue assert i>0, "_parse_headers fed line with no : and no leading WS" diff --git a/Lib/profiling/sampling/_heatmap_assets/heatmap.css b/Lib/profiling/sampling/_heatmap_assets/heatmap.css index 44915b2a2da..ada6d2f2ee1 100644 --- a/Lib/profiling/sampling/_heatmap_assets/heatmap.css +++ b/Lib/profiling/sampling/_heatmap_assets/heatmap.css @@ -1094,18 +1094,34 @@ #scroll_marker .marker { } #scroll_marker .marker.cold { + background: var(--heat-1); +} + +#scroll_marker .marker.cool { background: var(--heat-2); } +#scroll_marker .marker.mild { + background: var(--heat-3); +} + #scroll_marker .marker.warm { - background: var(--heat-5); + background: var(--heat-4); } #scroll_marker .marker.hot { + background: var(--heat-5); +} + +#scroll_marker .marker.very-hot { + background: var(--heat-6); +} + +#scroll_marker .marker.intense { background: var(--heat-7); } -#scroll_marker .marker.vhot { +#scroll_marker .marker.extreme { background: var(--heat-8); } diff --git a/Lib/profiling/sampling/_heatmap_assets/heatmap.js b/Lib/profiling/sampling/_heatmap_assets/heatmap.js index ccf82386363..5a7ff5dd61a 100644 --- a/Lib/profiling/sampling/_heatmap_assets/heatmap.js +++ b/Lib/profiling/sampling/_heatmap_assets/heatmap.js @@ -26,6 +26,7 @@ function toggleTheme() { if (btn) { btn.innerHTML = next === 'dark' ? '☼' : '☾'; // sun or moon } + applyLineColors(); // Rebuild scroll marker with new theme colors buildScrollMarker(); @@ -160,13 +161,6 @@ function getSampleCount(line) { return parseInt(text) || 0; } -function getIntensityClass(ratio) { - if (ratio > 0.75) return 'vhot'; - if (ratio > 0.5) return 'hot'; - if (ratio > 0.25) return 'warm'; - return 'cold'; -} - // ============================================================================ // Scroll Minimap // ============================================================================ @@ -194,7 +188,7 @@ function buildScrollMarker() { const lineTop = Math.floor(line.offsetTop * markerScale); const lineNumber = index + 1; - const intensityClass = maxSamples > 0 ? getIntensityClass(samples / maxSamples) : 'cold'; + const intensityClass = maxSamples > 0 ? 
(intensityToClass(samples / maxSamples) || 'cold') : 'cold'; if (lineNumber === prevLine + 1 && lastMark?.classList.contains(intensityClass)) { lastMark.style.height = `${lineTop + lineHeight - lastTop}px`; @@ -212,6 +206,21 @@ function buildScrollMarker() { document.body.appendChild(scrollMarker); } +function applyLineColors() { + const lines = document.querySelectorAll('.code-line'); + lines.forEach(line => { + let intensity; + if (colorMode === 'self') { + intensity = parseFloat(line.getAttribute('data-self-intensity')) || 0; + } else { + intensity = parseFloat(line.getAttribute('data-cumulative-intensity')) || 0; + } + + const color = intensityToColor(intensity); + line.style.background = color; + }); +} + // ============================================================================ // Toggle Controls // ============================================================================ @@ -264,20 +273,7 @@ function applyHotFilter() { function toggleColorMode() { colorMode = colorMode === 'self' ? 'cumulative' : 'self'; - const lines = document.querySelectorAll('.code-line'); - - lines.forEach(line => { - let bgColor; - if (colorMode === 'self') { - bgColor = line.getAttribute('data-self-color'); - } else { - bgColor = line.getAttribute('data-cumulative-color'); - } - - if (bgColor) { - line.style.background = bgColor; - } - }); + applyLineColors(); updateToggleUI('toggle-color-mode', colorMode === 'cumulative'); @@ -295,14 +291,7 @@ function toggleColorMode() { document.addEventListener('DOMContentLoaded', function() { // Restore UI state (theme, etc.) restoreUIState(); - - // Apply background colors - document.querySelectorAll('.code-line[data-bg-color]').forEach(line => { - const bgColor = line.getAttribute('data-bg-color'); - if (bgColor) { - line.style.background = bgColor; - } - }); + applyLineColors(); // Initialize navigation buttons document.querySelectorAll('.nav-btn').forEach(button => { diff --git a/Lib/profiling/sampling/_heatmap_assets/heatmap_index.js b/Lib/profiling/sampling/_heatmap_assets/heatmap_index.js index 5f3e65c3310..4ddacca5173 100644 --- a/Lib/profiling/sampling/_heatmap_assets/heatmap_index.js +++ b/Lib/profiling/sampling/_heatmap_assets/heatmap_index.js @@ -1,6 +1,19 @@ // Tachyon Profiler - Heatmap Index JavaScript // Index page specific functionality +// ============================================================================ +// Heatmap Bar Coloring +// ============================================================================ + +function applyHeatmapBarColors() { + const bars = document.querySelectorAll('.heatmap-bar[data-intensity]'); + bars.forEach(bar => { + const intensity = parseFloat(bar.getAttribute('data-intensity')) || 0; + const color = intensityToColor(intensity); + bar.style.backgroundColor = color; + }); +} + // ============================================================================ // Theme Support // ============================================================================ @@ -17,6 +30,8 @@ function toggleTheme() { if (btn) { btn.innerHTML = next === 'dark' ? 
'☼' : '☾'; // sun or moon } + + applyHeatmapBarColors(); } function restoreUIState() { @@ -108,4 +123,5 @@ function collapseAll() { document.addEventListener('DOMContentLoaded', function() { restoreUIState(); + applyHeatmapBarColors(); }); diff --git a/Lib/profiling/sampling/_heatmap_assets/heatmap_shared.js b/Lib/profiling/sampling/_heatmap_assets/heatmap_shared.js new file mode 100644 index 00000000000..f44ebcff4ff --- /dev/null +++ b/Lib/profiling/sampling/_heatmap_assets/heatmap_shared.js @@ -0,0 +1,40 @@ +// Tachyon Profiler - Shared Heatmap JavaScript +// Common utilities shared between index and file views + +// ============================================================================ +// Heat Level Mapping (Single source of truth for intensity thresholds) +// ============================================================================ + +// Maps intensity (0-1) to heat level (0-8). Level 0 = no heat, 1-8 = heat levels. +function intensityToHeatLevel(intensity) { + if (intensity <= 0) return 0; + if (intensity <= 0.125) return 1; + if (intensity <= 0.25) return 2; + if (intensity <= 0.375) return 3; + if (intensity <= 0.5) return 4; + if (intensity <= 0.625) return 5; + if (intensity <= 0.75) return 6; + if (intensity <= 0.875) return 7; + return 8; +} + +// Class names corresponding to heat levels 1-8 (used by scroll marker) +const HEAT_CLASS_NAMES = ['cold', 'cool', 'mild', 'warm', 'hot', 'very-hot', 'intense', 'extreme']; + +function intensityToClass(intensity) { + const level = intensityToHeatLevel(intensity); + return level === 0 ? null : HEAT_CLASS_NAMES[level - 1]; +} + +// ============================================================================ +// Color Mapping (Intensity to Heat Color) +// ============================================================================ + +function intensityToColor(intensity) { + const level = intensityToHeatLevel(intensity); + if (level === 0) { + return 'transparent'; + } + const rootStyle = getComputedStyle(document.documentElement); + return rootStyle.getPropertyValue(`--heat-${level}`).trim(); +} diff --git a/Lib/profiling/sampling/_shared_assets/base.css b/Lib/profiling/sampling/_shared_assets/base.css index 20516913496..d9223a98c0f 100644 --- a/Lib/profiling/sampling/_shared_assets/base.css +++ b/Lib/profiling/sampling/_shared_assets/base.css @@ -57,9 +57,9 @@ :root, [data-theme="light"] { --header-gradient: linear-gradient(135deg, #3776ab 0%, #4584bb 100%); /* Light mode heat palette - blue to yellow to orange to red (cold to hot) */ - --heat-1: #d6e9f8; + --heat-1: #7ba3d1; --heat-2: #a8d0ef; - --heat-3: #7ba3d1; + --heat-3: #d6e9f8; --heat-4: #ffe6a8; --heat-5: #ffd43b; --heat-6: #ffb84d; @@ -104,11 +104,11 @@ [data-theme="dark"] { --header-gradient: linear-gradient(135deg, #21262d 0%, #30363d 100%); /* Dark mode heat palette - dark blue to teal to yellow to orange (cold to hot) */ - --heat-1: #1e3a5f; - --heat-2: #2d5580; - --heat-3: #4a7ba7; - --heat-4: #5a9fa8; - --heat-5: #7ec488; + --heat-1: #4a7ba7; + --heat-2: #5a9fa8; + --heat-3: #6ab5b5; + --heat-4: #7ec488; + --heat-5: #a0d878; --heat-6: #c4de6a; --heat-7: #f4d44d; --heat-8: #ff6b35; diff --git a/Lib/profiling/sampling/cli.py b/Lib/profiling/sampling/cli.py index 5c0e39d7737..0a082c0c638 100644 --- a/Lib/profiling/sampling/cli.py +++ b/Lib/profiling/sampling/cli.py @@ -195,6 +195,11 @@ def _add_sampling_options(parser): dest="gc", help='Don\'t include artificial "" frames to denote active garbage collection', ) + sampling_group.add_argument( + "--async-aware", + 
action="store_true", + help="Enable async-aware profiling (uses task-based stack reconstruction)", + ) def _add_mode_options(parser): @@ -205,7 +210,14 @@ def _add_mode_options(parser): choices=["wall", "cpu", "gil"], default="wall", help="Sampling mode: wall (all samples), cpu (only samples when thread is on CPU), " - "gil (only samples when thread holds the GIL)", + "gil (only samples when thread holds the GIL). Incompatible with --async-aware", + ) + mode_group.add_argument( + "--async-mode", + choices=["running", "all"], + default="running", + help='Async profiling mode: "running" (only running task) ' + 'or "all" (all tasks including waiting). Requires --async-aware', ) @@ -382,6 +394,27 @@ def _validate_args(args, parser): "Live mode requires the curses module, which is not available." ) + # Async-aware mode is incompatible with --native, --no-gc, --mode, and --all-threads + if args.async_aware: + issues = [] + if args.native: + issues.append("--native") + if not args.gc: + issues.append("--no-gc") + if hasattr(args, 'mode') and args.mode != "wall": + issues.append(f"--mode={args.mode}") + if hasattr(args, 'all_threads') and args.all_threads: + issues.append("--all-threads") + if issues: + parser.error( + f"Options {', '.join(issues)} are incompatible with --async-aware. " + "Async-aware profiling uses task-based stack reconstruction." + ) + + # --async-mode requires --async-aware + if hasattr(args, 'async_mode') and args.async_mode != "running" and not args.async_aware: + parser.error("--async-mode requires --async-aware to be enabled.") + # Live mode is incompatible with format options if hasattr(args, 'live') and args.live: if args.format != "pstats": @@ -570,6 +603,7 @@ def _handle_attach(args): all_threads=args.all_threads, realtime_stats=args.realtime_stats, mode=mode, + async_aware=args.async_mode if args.async_aware else None, native=args.native, gc=args.gc, ) @@ -618,6 +652,7 @@ def _handle_run(args): all_threads=args.all_threads, realtime_stats=args.realtime_stats, mode=mode, + async_aware=args.async_mode if args.async_aware else None, native=args.native, gc=args.gc, ) @@ -650,6 +685,7 @@ def _handle_live_attach(args, pid): limit=20, # Default limit pid=pid, mode=mode, + async_aware=args.async_mode if args.async_aware else None, ) # Sample in live mode @@ -660,6 +696,7 @@ def _handle_live_attach(args, pid): all_threads=args.all_threads, realtime_stats=args.realtime_stats, mode=mode, + async_aware=args.async_mode if args.async_aware else None, native=args.native, gc=args.gc, ) @@ -689,6 +726,7 @@ def _handle_live_run(args): limit=20, # Default limit pid=process.pid, mode=mode, + async_aware=args.async_mode if args.async_aware else None, ) # Profile the subprocess in live mode @@ -700,6 +738,7 @@ def _handle_live_run(args): all_threads=args.all_threads, realtime_stats=args.realtime_stats, mode=mode, + async_aware=args.async_mode if args.async_aware else None, native=args.native, gc=args.gc, ) diff --git a/Lib/profiling/sampling/collector.py b/Lib/profiling/sampling/collector.py index 6187f351cb5..f63ea0afd8a 100644 --- a/Lib/profiling/sampling/collector.py +++ b/Lib/profiling/sampling/collector.py @@ -2,10 +2,16 @@ from .constants import ( THREAD_STATUS_HAS_GIL, THREAD_STATUS_ON_CPU, - THREAD_STATUS_UNKNOWN, THREAD_STATUS_GIL_REQUESTED, + THREAD_STATUS_UNKNOWN, ) +try: + from _remote_debugging import FrameInfo +except ImportError: + # Fallback definition if _remote_debugging is not available + FrameInfo = None + class Collector(ABC): @abstractmethod def collect(self, 
stack_frames): @@ -33,6 +39,95 @@ def _iter_all_frames(self, stack_frames, skip_idle=False): if frames: yield frames, thread_info.thread_id + def _iter_async_frames(self, awaited_info_list): + # Phase 1: Index tasks and build parent relationships with pre-computed selection + task_map, child_to_parent, all_task_ids, all_parent_ids = self._build_task_graph(awaited_info_list) + + # Phase 2: Find leaf tasks (tasks not awaited by anyone) + leaf_task_ids = self._find_leaf_tasks(all_task_ids, all_parent_ids) + + # Phase 3: Build linear stacks from each leaf to root (optimized - no sorting!) + yield from self._build_linear_stacks(leaf_task_ids, task_map, child_to_parent) + + def _build_task_graph(self, awaited_info_list): + task_map = {} + child_to_parent = {} # Maps child_id -> (selected_parent_id, parent_count) + all_task_ids = set() + all_parent_ids = set() # Track ALL parent IDs for leaf detection + + for awaited_info in awaited_info_list: + thread_id = awaited_info.thread_id + for task_info in awaited_info.awaited_by: + task_id = task_info.task_id + task_map[task_id] = (task_info, thread_id) + all_task_ids.add(task_id) + + # Pre-compute selected parent and count for optimization + if task_info.awaited_by: + parent_ids = [p.task_name for p in task_info.awaited_by] + parent_count = len(parent_ids) + # Track ALL parents for leaf detection + all_parent_ids.update(parent_ids) + # Use min() for O(n) instead of sorted()[0] which is O(n log n) + selected_parent = min(parent_ids) if parent_count > 1 else parent_ids[0] + child_to_parent[task_id] = (selected_parent, parent_count) + + return task_map, child_to_parent, all_task_ids, all_parent_ids + + def _find_leaf_tasks(self, all_task_ids, all_parent_ids): + # Leaves are tasks that are not parents of any other task + return all_task_ids - all_parent_ids + + def _build_linear_stacks(self, leaf_task_ids, task_map, child_to_parent): + for leaf_id in leaf_task_ids: + frames = [] + visited = set() + current_id = leaf_id + thread_id = None + + # Follow the single parent chain from leaf to root + while current_id is not None: + # Cycle detection + if current_id in visited: + break + visited.add(current_id) + + # Check if task exists in task_map + if current_id not in task_map: + break + + task_info, tid = task_map[current_id] + + # Set thread_id from first task + if thread_id is None: + thread_id = tid + + # Add all frames from all coroutines in this task + if task_info.coroutine_stack: + for coro_info in task_info.coroutine_stack: + for frame in coro_info.call_stack: + frames.append(frame) + + # Get pre-computed parent info (no sorting needed!) 
+ parent_info = child_to_parent.get(current_id) + + # Add task boundary marker with parent count annotation if multiple parents + task_name = task_info.task_name or "Task-" + str(task_info.task_id) + if parent_info: + selected_parent, parent_count = parent_info + if parent_count > 1: + task_name = f"{task_name} ({parent_count} parents)" + frames.append(FrameInfo(("", 0, task_name))) + current_id = selected_parent + else: + # Root task - no parent + frames.append(FrameInfo(("", 0, task_name))) + current_id = None + + # Yield the complete stack if we collected any frames + if frames and thread_id is not None: + yield frames, thread_id, leaf_id + def _is_gc_frame(self, frame): if isinstance(frame, tuple): funcname = frame[2] if len(frame) >= 3 else "" diff --git a/Lib/profiling/sampling/heatmap_collector.py b/Lib/profiling/sampling/heatmap_collector.py index eb128aba9b1..8a8ba9628df 100644 --- a/Lib/profiling/sampling/heatmap_collector.py +++ b/Lib/profiling/sampling/heatmap_collector.py @@ -5,6 +5,7 @@ import html import importlib.resources import json +import math import os import platform import site @@ -44,31 +45,6 @@ class TreeNode: children: Dict[str, 'TreeNode'] = field(default_factory=dict) -@dataclass -class ColorGradient: - """Configuration for heatmap color gradient calculations.""" - # Color stops thresholds - stop_1: float = 0.2 # Blue to cyan transition - stop_2: float = 0.4 # Cyan to green transition - stop_3: float = 0.6 # Green to yellow transition - stop_4: float = 0.8 # Yellow to orange transition - stop_5: float = 1.0 # Orange to red transition - - # Alpha (opacity) values - alpha_very_cold: float = 0.3 - alpha_cold: float = 0.4 - alpha_medium: float = 0.5 - alpha_warm: float = 0.6 - alpha_hot_base: float = 0.7 - alpha_hot_range: float = 0.15 - - # Gradient multiplier - multiplier: int = 5 - - # Cache for calculated colors - cache: Dict[float, Tuple[int, int, int, float]] = field(default_factory=dict) - - # ============================================================================ # Module Path Analysis # ============================================================================ @@ -224,8 +200,9 @@ def _load_templates(self): self.file_css = css_content # Load JS - self.index_js = (assets_dir / "heatmap_index.js").read_text(encoding="utf-8") - self.file_js = (assets_dir / "heatmap.js").read_text(encoding="utf-8") + shared_js = (assets_dir / "heatmap_shared.js").read_text(encoding="utf-8") + self.index_js = f"{shared_js}\n{(assets_dir / 'heatmap_index.js').read_text(encoding='utf-8')}" + self.file_js = f"{shared_js}\n{(assets_dir / 'heatmap.js').read_text(encoding='utf-8')}" # Load Python logo logo_dir = template_dir / "_assets" @@ -321,18 +298,13 @@ def _calculate_node_stats(node: TreeNode) -> Tuple[int, int]: class _HtmlRenderer: """Renders hierarchical tree structures as HTML.""" - def __init__(self, file_index: Dict[str, str], color_gradient: ColorGradient, - calculate_intensity_color_func): - """Initialize renderer with file index and color calculation function. + def __init__(self, file_index: Dict[str, str]): + """Initialize renderer with file index. 
Args: file_index: Mapping from filenames to HTML file names - color_gradient: ColorGradient configuration - calculate_intensity_color_func: Function to calculate colors """ self.file_index = file_index - self.color_gradient = color_gradient - self.calculate_intensity_color = calculate_intensity_color_func self.heatmap_bar_height = 16 def render_hierarchical_html(self, trees: Dict[str, TreeNode]) -> str: @@ -450,8 +422,6 @@ def _render_file_item(self, stat: FileStats, indent: str = '') -> str: module_name = html.escape(stat.module_name) intensity = stat.percentage / 100.0 - r, g, b, alpha = self.calculate_intensity_color(intensity) - bg_color = f"rgba({r}, {g}, {b}, {alpha})" bar_width = min(stat.percentage, 100) html_file = self.file_index[stat.filename] @@ -459,7 +429,7 @@ def _render_file_item(self, stat: FileStats, indent: str = '') -> str: return (f'{indent}
\n' f'{indent} 📄 {module_name}\n' f'{indent} {stat.total_samples:,} samples\n' - f'{indent}
\n' + f'{indent}
\n' f'{indent}
\n') @@ -501,20 +471,12 @@ def __init__(self, *args, **kwargs): self._path_info = get_python_path_info() self.stats = {} - # Color gradient configuration - self._color_gradient = ColorGradient() - # Template loader (loads all templates once) self._template_loader = _TemplateLoader() # File index (populated during export) self.file_index = {} - @property - def _color_cache(self): - """Compatibility property for accessing color cache.""" - return self._color_gradient.cache - def set_stats(self, sample_interval_usec, duration_sec, sample_rate, error_rate=None, missed_samples=None, **kwargs): """Set profiling statistics to include in heatmap output. @@ -746,8 +708,7 @@ def _generate_index_html(self, index_path: Path, file_stats: List[FileStats]): tree = _TreeBuilder.build_file_tree(file_stats) # Render tree as HTML - renderer = _HtmlRenderer(self.file_index, self._color_gradient, - self._calculate_intensity_color) + renderer = _HtmlRenderer(self.file_index) sections_html = renderer.render_hierarchical_html(tree) # Format error rate and missed samples with bar classes @@ -809,56 +770,6 @@ def _generate_index_html(self, index_path: Path, file_stats: List[FileStats]): except (IOError, OSError) as e: raise RuntimeError(f"Failed to write index file {index_path}: {e}") from e - def _calculate_intensity_color(self, intensity: float) -> Tuple[int, int, int, float]: - """Calculate RGB color and alpha for given intensity (0-1 range). - - Returns (r, g, b, alpha) tuple representing the heatmap color gradient: - blue -> green -> yellow -> orange -> red - - Results are cached to improve performance. - """ - # Round to 3 decimal places for cache key - cache_key = round(intensity, 3) - if cache_key in self._color_gradient.cache: - return self._color_gradient.cache[cache_key] - - gradient = self._color_gradient - m = gradient.multiplier - - # Color stops with (threshold, rgb_func, alpha_func) - stops = [ - (gradient.stop_1, - lambda i: (0, int(150 * i * m), 255), - lambda i: gradient.alpha_very_cold), - (gradient.stop_2, - lambda i: (0, 255, int(255 * (1 - (i - gradient.stop_1) * m))), - lambda i: gradient.alpha_cold), - (gradient.stop_3, - lambda i: (int(255 * (i - gradient.stop_2) * m), 255, 0), - lambda i: gradient.alpha_medium), - (gradient.stop_4, - lambda i: (255, int(200 - 100 * (i - gradient.stop_3) * m), 0), - lambda i: gradient.alpha_warm), - (gradient.stop_5, - lambda i: (255, int(100 * (1 - (i - gradient.stop_4) * m)), 0), - lambda i: gradient.alpha_hot_base + gradient.alpha_hot_range * (i - gradient.stop_4) * m), - ] - - result = None - for threshold, rgb_func, alpha_func in stops: - if intensity < threshold or threshold == gradient.stop_5: - r, g, b = rgb_func(intensity) - result = (r, g, b, alpha_func(intensity)) - break - - # Fallback - if result is None: - result = (255, 0, 0, 0.75) - - # Cache the result - self._color_gradient.cache[cache_key] = result - return result - def _generate_file_html(self, output_path: Path, filename: str, line_counts: Dict[int, int], self_counts: Dict[int, int], file_stat: FileStats): @@ -913,25 +824,23 @@ def _build_line_html(self, line_num: int, line_content: str, # Calculate colors for both self and cumulative modes if cumulative_samples > 0: - cumulative_intensity = cumulative_samples / max_samples if max_samples > 0 else 0 - self_intensity = self_samples / max_self_samples if max_self_samples > 0 and self_samples > 0 else 0 + log_cumulative = math.log(cumulative_samples + 1) + log_max = math.log(max_samples + 1) + cumulative_intensity = log_cumulative / 
log_max if log_max > 0 else 0 - # Default to self-based coloring - intensity = self_intensity if self_samples > 0 else cumulative_intensity - r, g, b, alpha = self._calculate_intensity_color(intensity) - bg_color = f"rgba({r}, {g}, {b}, {alpha})" - - # Pre-calculate colors for both modes (for JS toggle) - self_bg_color = self._format_color_for_intensity(self_intensity) if self_samples > 0 else "transparent" - cumulative_bg_color = self._format_color_for_intensity(cumulative_intensity) + if self_samples > 0 and max_self_samples > 0: + log_self = math.log(self_samples + 1) + log_max_self = math.log(max_self_samples + 1) + self_intensity = log_self / log_max_self if log_max_self > 0 else 0 + else: + self_intensity = 0 self_display = f"{self_samples:,}" if self_samples > 0 else "" cumulative_display = f"{cumulative_samples:,}" tooltip = f"Self: {self_samples:,}, Total: {cumulative_samples:,}" else: - bg_color = "transparent" - self_bg_color = "transparent" - cumulative_bg_color = "transparent" + cumulative_intensity = 0 + self_intensity = 0 self_display = "" cumulative_display = "" tooltip = "" @@ -939,13 +848,14 @@ def _build_line_html(self, line_num: int, line_content: str, # Get navigation buttons nav_buttons_html = self._build_navigation_buttons(filename, line_num) - # Build line HTML + # Build line HTML with intensity data attributes line_html = html.escape(line_content.rstrip('\n')) title_attr = f' title="{html.escape(tooltip)}"' if tooltip else "" return ( - f'
\n' f'
{line_num}
\n' f'
{self_display}
\n' @@ -955,11 +865,6 @@ def _build_line_html(self, line_num: int, line_content: str, f'
\n' ) - def _format_color_for_intensity(self, intensity: float) -> str: - """Format color as rgba() string for given intensity.""" - r, g, b, alpha = self._calculate_intensity_color(intensity) - return f"rgba({r}, {g}, {b}, {alpha})" - def _build_navigation_buttons(self, filename: str, line_num: int) -> str: """Build navigation buttons for callers/callees.""" line_key = (filename, line_num) diff --git a/Lib/profiling/sampling/live_collector/collector.py b/Lib/profiling/sampling/live_collector/collector.py index 7adbf1bbe7f..5edb02e6e88 100644 --- a/Lib/profiling/sampling/live_collector/collector.py +++ b/Lib/profiling/sampling/live_collector/collector.py @@ -103,6 +103,7 @@ def __init__( pid=None, display=None, mode=None, + async_aware=None, ): """ Initialize the live stats collector. @@ -115,6 +116,7 @@ def __init__( pid: Process ID being profiled display: DisplayInterface implementation (None means curses will be used) mode: Profiling mode ('cpu', 'gil', etc.) - affects what stats are shown + async_aware: Async tracing mode - None (sync only), "all" or "running" """ self.result = collections.defaultdict( lambda: dict(total_rec_calls=0, direct_calls=0, cumulative_calls=0) @@ -133,6 +135,9 @@ def __init__( self.running = True self.pid = pid self.mode = mode # Profiling mode + self.async_aware = async_aware # Async tracing mode + # Pre-select frame iterator method to avoid per-call dispatch overhead + self._get_frame_iterator = self._get_async_frame_iterator if async_aware else self._get_sync_frame_iterator self._saved_stdout = None self._saved_stderr = None self._devnull = None @@ -294,6 +299,15 @@ def process_frames(self, frames, thread_id=None): if thread_data: thread_data.result[top_location]["direct_calls"] += 1 + def _get_sync_frame_iterator(self, stack_frames): + """Iterator for sync frames.""" + return self._iter_all_frames(stack_frames, skip_idle=self.skip_idle) + + def _get_async_frame_iterator(self, stack_frames): + """Iterator for async frames, yielding (frames, thread_id) tuples.""" + for frames, thread_id, task_id in self._iter_async_frames(stack_frames): + yield frames, thread_id + def collect_failed_sample(self): self.failed_samples += 1 self.total_samples += 1 @@ -304,78 +318,40 @@ def collect(self, stack_frames): self.start_time = time.perf_counter() self._last_display_update = self.start_time - # Thread status counts for this sample - temp_status_counts = { - "has_gil": 0, - "on_cpu": 0, - "gil_requested": 0, - "unknown": 0, - "total": 0, - } has_gc_frame = False - # Always collect data, even when paused - # Track thread status flags and GC frames - for interpreter_info in stack_frames: - threads = getattr(interpreter_info, "threads", []) - for thread_info in threads: - temp_status_counts["total"] += 1 + # Collect thread status stats (only available in sync mode) + if not self.async_aware: + status_counts, sample_has_gc, per_thread_stats = self._collect_thread_status_stats(stack_frames) + for key, count in status_counts.items(): + self.thread_status_counts[key] += count + if sample_has_gc: + has_gc_frame = True - # Track thread status using bit flags - status_flags = getattr(thread_info, "status", 0) - thread_id = getattr(thread_info, "thread_id", None) + for thread_id, stats in per_thread_stats.items(): + thread_data = self._get_or_create_thread_data(thread_id) + thread_data.has_gil += stats.get("has_gil", 0) + thread_data.on_cpu += stats.get("on_cpu", 0) + thread_data.gil_requested += stats.get("gil_requested", 0) + thread_data.unknown += stats.get("unknown", 0) + 
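
As a hedged sketch of the dispatch pattern introduced in the live collector above (the `_Collector` name and trimmed-down methods are illustrative only; the real class carries much more state), binding the frame iterator once in ``__init__`` avoids re-checking ``async_aware`` on every sample:

    class _Collector:
        def __init__(self, async_aware=None):
            self.async_aware = async_aware
            # Choose the iterator once, instead of branching per collect() call.
            self._get_frame_iterator = (
                self._iter_async if async_aware else self._iter_sync
            )

        def _iter_sync(self, stack_frames):
            # Sync samples already come as (frames, thread_id) pairs.
            yield from stack_frames

        def _iter_async(self, stack_frames):
            # Async samples carry a task id as well; drop it so callers always
            # see a uniform (frames, thread_id) shape.
            for frames, thread_id, _task_id in stack_frames:
                yield frames, thread_id

``collect()`` can then iterate ``for frames, thread_id in self._get_frame_iterator(stack_frames)`` regardless of mode.
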
thread_data.total += stats.get("total", 0) + if stats.get("gc_samples", 0): + thread_data.gc_frame_samples += stats["gc_samples"] - # Update aggregated counts - if status_flags & THREAD_STATUS_HAS_GIL: - temp_status_counts["has_gil"] += 1 - if status_flags & THREAD_STATUS_ON_CPU: - temp_status_counts["on_cpu"] += 1 - if status_flags & THREAD_STATUS_GIL_REQUESTED: - temp_status_counts["gil_requested"] += 1 - if status_flags & THREAD_STATUS_UNKNOWN: - temp_status_counts["unknown"] += 1 + # Process frames using pre-selected iterator + for frames, thread_id in self._get_frame_iterator(stack_frames): + if not frames: + continue - # Update per-thread status counts - if thread_id is not None: - thread_data = self._get_or_create_thread_data(thread_id) - thread_data.increment_status_flag(status_flags) + self.process_frames(frames, thread_id=thread_id) - # Process frames (respecting skip_idle) - if self.skip_idle: - has_gil = bool(status_flags & THREAD_STATUS_HAS_GIL) - on_cpu = bool(status_flags & THREAD_STATUS_ON_CPU) - if not (has_gil or on_cpu): - continue + # Track thread IDs + if thread_id is not None and thread_id not in self.thread_ids: + self.thread_ids.append(thread_id) - frames = getattr(thread_info, "frame_info", None) - if frames: - self.process_frames(frames, thread_id=thread_id) - - # Track thread IDs only for threads that actually have samples - if ( - thread_id is not None - and thread_id not in self.thread_ids - ): - self.thread_ids.append(thread_id) - - # Increment per-thread sample count and check for GC frames - thread_has_gc_frame = False - for frame in frames: - funcname = getattr(frame, "funcname", "") - if "" in funcname or "gc_collect" in funcname: - has_gc_frame = True - thread_has_gc_frame = True - break - - if thread_id is not None: - thread_data = self._get_or_create_thread_data(thread_id) - thread_data.sample_count += 1 - if thread_has_gc_frame: - thread_data.gc_frame_samples += 1 - - # Update cumulative thread status counts - for key, count in temp_status_counts.items(): - self.thread_status_counts[key] += count + if thread_id is not None: + thread_data = self._get_or_create_thread_data(thread_id) + thread_data.sample_count += 1 if has_gc_frame: self.gc_frame_samples += 1 diff --git a/Lib/profiling/sampling/pstats_collector.py b/Lib/profiling/sampling/pstats_collector.py index b8b37a10c43..4fe3acfa9ff 100644 --- a/Lib/profiling/sampling/pstats_collector.py +++ b/Lib/profiling/sampling/pstats_collector.py @@ -42,8 +42,14 @@ def _process_frames(self, frames): self.callers[callee][caller] += 1 def collect(self, stack_frames): - for frames, thread_id in self._iter_all_frames(stack_frames, skip_idle=self.skip_idle): - self._process_frames(frames) + if stack_frames and hasattr(stack_frames[0], "awaited_by"): + # Async frame processing + for frames, thread_id, task_id in self._iter_async_frames(stack_frames): + self._process_frames(frames) + else: + # Regular frame processing + for frames, thread_id in self._iter_all_frames(stack_frames, skip_idle=self.skip_idle): + self._process_frames(frames) def export(self, filename): self.create_stats() diff --git a/Lib/profiling/sampling/sample.py b/Lib/profiling/sampling/sample.py index 46fc1a05afa..dd4ea1edbf6 100644 --- a/Lib/profiling/sampling/sample.py +++ b/Lib/profiling/sampling/sample.py @@ -27,28 +27,31 @@ class SampleProfiler: - def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL, native=False, gc=True, skip_non_matching_threads=True): + def __init__(self, pid, sample_interval_usec, 
all_threads, *, mode=PROFILING_MODE_WALL, native=False, gc=True, skip_non_matching_threads=True, collect_stats=False): self.pid = pid self.sample_interval_usec = sample_interval_usec self.all_threads = all_threads self.mode = mode # Store mode for later use + self.collect_stats = collect_stats if _FREE_THREADED_BUILD: self.unwinder = _remote_debugging.RemoteUnwinder( self.pid, all_threads=self.all_threads, mode=mode, native=native, gc=gc, - skip_non_matching_threads=skip_non_matching_threads + skip_non_matching_threads=skip_non_matching_threads, cache_frames=True, + stats=collect_stats ) else: only_active_threads = bool(self.all_threads) self.unwinder = _remote_debugging.RemoteUnwinder( self.pid, only_active_thread=only_active_threads, mode=mode, native=native, gc=gc, - skip_non_matching_threads=skip_non_matching_threads + skip_non_matching_threads=skip_non_matching_threads, cache_frames=True, + stats=collect_stats ) # Track sample intervals and total sample count self.sample_intervals = deque(maxlen=100) self.total_samples = 0 self.realtime_stats = False - def sample(self, collector, duration_sec=10): + def sample(self, collector, duration_sec=10, *, async_aware=False): sample_interval_sec = self.sample_interval_usec / 1_000_000 running_time = 0 num_samples = 0 @@ -68,7 +71,12 @@ def sample(self, collector, duration_sec=10): current_time = time.perf_counter() if next_time < current_time: try: - stack_frames = self.unwinder.get_stack_trace() + if async_aware == "all": + stack_frames = self.unwinder.get_all_awaited_by() + elif async_aware == "running": + stack_frames = self.unwinder.get_async_stack_trace() + else: + stack_frames = self.unwinder.get_stack_trace() collector.collect(stack_frames) except ProcessLookupError: duration_sec = current_time - start_time @@ -124,6 +132,10 @@ def sample(self, collector, duration_sec=10): print(f"Sample rate: {sample_rate:.2f} samples/sec") print(f"Error rate: {error_rate:.2f}%") + # Print unwinder stats if stats collection is enabled + if self.collect_stats: + self._print_unwinder_stats() + # Pass stats to flamegraph collector if it's the right type if hasattr(collector, 'set_stats'): collector.set_stats(self.sample_interval_usec, running_time, sample_rate, error_rate, missed_samples, mode=self.mode) @@ -171,17 +183,100 @@ def _print_realtime_stats(self): (1.0 / min_hz) * 1_000_000 if min_hz > 0 else 0 ) # Max time = Min Hz + # Build cache stats string if stats collection is enabled + cache_stats_str = "" + if self.collect_stats: + try: + stats = self.unwinder.get_stats() + hits = stats.get('frame_cache_hits', 0) + partial = stats.get('frame_cache_partial_hits', 0) + misses = stats.get('frame_cache_misses', 0) + total = hits + partial + misses + if total > 0: + hit_pct = (hits + partial) / total * 100 + cache_stats_str = f" {ANSIColors.MAGENTA}Cache: {hit_pct:.1f}% ({hits}+{partial}/{misses}){ANSIColors.RESET}" + except RuntimeError: + pass + # Clear line and print stats print( - f"\r\033[K{ANSIColors.BOLD_BLUE}Real-time sampling stats:{ANSIColors.RESET} " - f"{ANSIColors.YELLOW}Mean: {mean_hz:.1f}Hz ({mean_us_per_sample:.2f}µs){ANSIColors.RESET} " - f"{ANSIColors.GREEN}Min: {min_hz:.1f}Hz ({max_us_per_sample:.2f}µs){ANSIColors.RESET} " - f"{ANSIColors.RED}Max: {max_hz:.1f}Hz ({min_us_per_sample:.2f}µs){ANSIColors.RESET} " - f"{ANSIColors.CYAN}Samples: {self.total_samples}{ANSIColors.RESET}", + f"\r\033[K{ANSIColors.BOLD_BLUE}Stats:{ANSIColors.RESET} " + f"{ANSIColors.YELLOW}{mean_hz:.1f}Hz ({mean_us_per_sample:.1f}µs){ANSIColors.RESET} " + 
f"{ANSIColors.GREEN}Min: {min_hz:.1f}Hz{ANSIColors.RESET} " + f"{ANSIColors.RED}Max: {max_hz:.1f}Hz{ANSIColors.RESET} " + f"{ANSIColors.CYAN}N={self.total_samples}{ANSIColors.RESET}" + f"{cache_stats_str}", end="", flush=True, ) + def _print_unwinder_stats(self): + """Print unwinder statistics including cache performance.""" + try: + stats = self.unwinder.get_stats() + except RuntimeError: + return # Stats not enabled + + print(f"\n{ANSIColors.BOLD_BLUE}{'='*50}{ANSIColors.RESET}") + print(f"{ANSIColors.BOLD_BLUE}Unwinder Statistics:{ANSIColors.RESET}") + + # Frame cache stats + total_samples = stats.get('total_samples', 0) + frame_cache_hits = stats.get('frame_cache_hits', 0) + frame_cache_partial_hits = stats.get('frame_cache_partial_hits', 0) + frame_cache_misses = stats.get('frame_cache_misses', 0) + total_lookups = frame_cache_hits + frame_cache_partial_hits + frame_cache_misses + + # Calculate percentages + hits_pct = (frame_cache_hits / total_lookups * 100) if total_lookups > 0 else 0 + partial_pct = (frame_cache_partial_hits / total_lookups * 100) if total_lookups > 0 else 0 + misses_pct = (frame_cache_misses / total_lookups * 100) if total_lookups > 0 else 0 + + print(f" {ANSIColors.CYAN}Frame Cache:{ANSIColors.RESET}") + print(f" Total samples: {total_samples:,}") + print(f" Full hits: {frame_cache_hits:,} ({ANSIColors.GREEN}{hits_pct:.1f}%{ANSIColors.RESET})") + print(f" Partial hits: {frame_cache_partial_hits:,} ({ANSIColors.YELLOW}{partial_pct:.1f}%{ANSIColors.RESET})") + print(f" Misses: {frame_cache_misses:,} ({ANSIColors.RED}{misses_pct:.1f}%{ANSIColors.RESET})") + + # Frame read stats + frames_from_cache = stats.get('frames_read_from_cache', 0) + frames_from_memory = stats.get('frames_read_from_memory', 0) + total_frames = frames_from_cache + frames_from_memory + cache_frame_pct = (frames_from_cache / total_frames * 100) if total_frames > 0 else 0 + memory_frame_pct = (frames_from_memory / total_frames * 100) if total_frames > 0 else 0 + + print(f" {ANSIColors.CYAN}Frame Reads:{ANSIColors.RESET}") + print(f" From cache: {frames_from_cache:,} ({ANSIColors.GREEN}{cache_frame_pct:.1f}%{ANSIColors.RESET})") + print(f" From memory: {frames_from_memory:,} ({ANSIColors.RED}{memory_frame_pct:.1f}%{ANSIColors.RESET})") + + # Code object cache stats + code_hits = stats.get('code_object_cache_hits', 0) + code_misses = stats.get('code_object_cache_misses', 0) + total_code = code_hits + code_misses + code_hits_pct = (code_hits / total_code * 100) if total_code > 0 else 0 + code_misses_pct = (code_misses / total_code * 100) if total_code > 0 else 0 + + print(f" {ANSIColors.CYAN}Code Object Cache:{ANSIColors.RESET}") + print(f" Hits: {code_hits:,} ({ANSIColors.GREEN}{code_hits_pct:.1f}%{ANSIColors.RESET})") + print(f" Misses: {code_misses:,} ({ANSIColors.RED}{code_misses_pct:.1f}%{ANSIColors.RESET})") + + # Memory operations + memory_reads = stats.get('memory_reads', 0) + memory_bytes = stats.get('memory_bytes_read', 0) + if memory_bytes >= 1024 * 1024: + memory_str = f"{memory_bytes / (1024 * 1024):.1f} MB" + elif memory_bytes >= 1024: + memory_str = f"{memory_bytes / 1024:.1f} KB" + else: + memory_str = f"{memory_bytes} B" + print(f" {ANSIColors.CYAN}Memory:{ANSIColors.RESET}") + print(f" Read operations: {memory_reads:,} ({memory_str})") + + # Stale invalidations + stale_invalidations = stats.get('stale_cache_invalidations', 0) + if stale_invalidations > 0: + print(f" {ANSIColors.YELLOW}Stale cache invalidations: {stale_invalidations}{ANSIColors.RESET}") + def sample( pid, @@ -191,6 
+286,7 @@ def sample( all_threads=False, realtime_stats=False, mode=PROFILING_MODE_WALL, + async_aware=None, native=False, gc=True, ): @@ -228,12 +324,13 @@ def sample( mode=mode, native=native, gc=gc, - skip_non_matching_threads=skip_non_matching_threads + skip_non_matching_threads=skip_non_matching_threads, + collect_stats=realtime_stats, ) profiler.realtime_stats = realtime_stats # Run the sampling - profiler.sample(collector, duration_sec) + profiler.sample(collector, duration_sec, async_aware=async_aware) return collector @@ -246,6 +343,7 @@ def sample_live( all_threads=False, realtime_stats=False, mode=PROFILING_MODE_WALL, + async_aware=None, native=False, gc=True, ): @@ -283,14 +381,15 @@ def sample_live( mode=mode, native=native, gc=gc, - skip_non_matching_threads=skip_non_matching_threads + skip_non_matching_threads=skip_non_matching_threads, + collect_stats=realtime_stats, ) profiler.realtime_stats = realtime_stats def curses_wrapper_func(stdscr): collector.init_curses(stdscr) try: - profiler.sample(collector, duration_sec) + profiler.sample(collector, duration_sec, async_aware=async_aware) # Mark as finished and keep the TUI running until user presses 'q' collector.mark_finished() # Keep processing input until user quits diff --git a/Lib/profiling/sampling/stack_collector.py b/Lib/profiling/sampling/stack_collector.py index e2653609313..1f766682858 100644 --- a/Lib/profiling/sampling/stack_collector.py +++ b/Lib/profiling/sampling/stack_collector.py @@ -17,10 +17,18 @@ def __init__(self, sample_interval_usec, *, skip_idle=False): self.skip_idle = skip_idle def collect(self, stack_frames, skip_idle=False): - for frames, thread_id in self._iter_all_frames(stack_frames, skip_idle=skip_idle): - if not frames: - continue - self.process_frames(frames, thread_id) + if stack_frames and hasattr(stack_frames[0], "awaited_by"): + # Async-aware mode: process async task frames + for frames, thread_id, task_id in self._iter_async_frames(stack_frames): + if not frames: + continue + self.process_frames(frames, thread_id) + else: + # Sync-only mode + for frames, thread_id in self._iter_all_frames(stack_frames, skip_idle=skip_idle): + if not frames: + continue + self.process_frames(frames, thread_id) def process_frames(self, frames, thread_id): pass diff --git a/Lib/test/test_argparse.py b/Lib/test/test_argparse.py index dff7ba750fa..7c5eed21219 100644 --- a/Lib/test/test_argparse.py +++ b/Lib/test/test_argparse.py @@ -2770,6 +2770,16 @@ def test_optional_subparsers(self): ret = parser.parse_args(()) self.assertIsNone(ret.command) + def test_subparser_help_with_parent_required_optional(self): + parser = ErrorRaisingArgumentParser(prog='PROG') + parser.add_argument('--foo', required=True) + parser.add_argument('--bar') + subparsers = parser.add_subparsers() + parser_sub = subparsers.add_parser('sub') + parser_sub.add_argument('arg') + self.assertEqual(parser_sub.format_usage(), + 'usage: PROG --foo FOO sub [-h] arg\n') + def test_help(self): self.assertEqual(self.parser.format_usage(), 'usage: PROG [-h] [--foo] bar {1,2,3} ...\n') @@ -3388,12 +3398,11 @@ def test_help_subparser_all_mutually_exclusive_group_members_suppressed(self): ''' self.assertEqual(cmd_foo.format_help(), textwrap.dedent(expected)) - def test_empty_group(self): + def test_usage_empty_group(self): # See issue 26952 - parser = argparse.ArgumentParser() + parser = ErrorRaisingArgumentParser(prog='PROG') group = parser.add_mutually_exclusive_group() - with self.assertRaises(ValueError): - parser.parse_args(['-h']) + 
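
For reference, a standalone reproduction of the behaviour this new argparse test pins down; the printed usage string is the one asserted below and reflects the fixed behaviour, not necessarily older releases:

    import argparse

    parser = argparse.ArgumentParser(prog="PROG")
    parser.add_argument("--foo", required=True)
    parser.add_argument("--bar")
    sub = parser.add_subparsers().add_parser("sub")
    sub.add_argument("arg")
    print(sub.format_usage())
    # Expected after this change: usage: PROG --foo FOO sub [-h] arg
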
self.assertEqual(parser.format_usage(), 'usage: PROG [-h]\n') def test_nested_mutex_groups(self): parser = argparse.ArgumentParser(prog='PROG') @@ -3661,25 +3670,29 @@ def get_parser(self, required): group.add_argument('-b', action='store_true', help='b help') parser.add_argument('-y', action='store_true', help='y help') group.add_argument('-c', action='store_true', help='c help') + parser.add_argument('-z', action='store_true', help='z help') return parser failures = ['-a -b', '-b -c', '-a -c', '-a -b -c'] successes = [ - ('-a', NS(a=True, b=False, c=False, x=False, y=False)), - ('-b', NS(a=False, b=True, c=False, x=False, y=False)), - ('-c', NS(a=False, b=False, c=True, x=False, y=False)), - ('-a -x', NS(a=True, b=False, c=False, x=True, y=False)), - ('-y -b', NS(a=False, b=True, c=False, x=False, y=True)), - ('-x -y -c', NS(a=False, b=False, c=True, x=True, y=True)), + ('-a', NS(a=True, b=False, c=False, x=False, y=False, z=False)), + ('-b', NS(a=False, b=True, c=False, x=False, y=False, z=False)), + ('-c', NS(a=False, b=False, c=True, x=False, y=False, z=False)), + ('-a -x', NS(a=True, b=False, c=False, x=True, y=False, z=False)), + ('-y -b', NS(a=False, b=True, c=False, x=False, y=True, z=False)), + ('-x -y -c', NS(a=False, b=False, c=True, x=True, y=True, z=False)), ] successes_when_not_required = [ - ('', NS(a=False, b=False, c=False, x=False, y=False)), - ('-x', NS(a=False, b=False, c=False, x=True, y=False)), - ('-y', NS(a=False, b=False, c=False, x=False, y=True)), + ('', NS(a=False, b=False, c=False, x=False, y=False, z=False)), + ('-x', NS(a=False, b=False, c=False, x=True, y=False, z=False)), + ('-y', NS(a=False, b=False, c=False, x=False, y=True, z=False)), ] - usage_when_required = usage_when_not_required = '''\ - usage: PROG [-h] [-x] [-a] [-b] [-y] [-c] + usage_when_not_required = '''\ + usage: PROG [-h] [-x] [-a | -b | -c] [-y] [-z] + ''' + usage_when_required = '''\ + usage: PROG [-h] [-x] (-a | -b | -c) [-y] [-z] ''' help = '''\ @@ -3690,6 +3703,7 @@ def get_parser(self, required): -b b help -y y help -c c help + -z z help ''' @@ -3743,23 +3757,27 @@ def get_parser(self, required): group.add_argument('a', nargs='?', help='a help') group.add_argument('-b', action='store_true', help='b help') group.add_argument('-c', action='store_true', help='c help') + parser.add_argument('-z', action='store_true', help='z help') return parser failures = ['X A -b', '-b -c', '-c X A'] successes = [ - ('X A', NS(a='A', b=False, c=False, x='X', y=False)), - ('X -b', NS(a=None, b=True, c=False, x='X', y=False)), - ('X -c', NS(a=None, b=False, c=True, x='X', y=False)), - ('X A -y', NS(a='A', b=False, c=False, x='X', y=True)), - ('X -y -b', NS(a=None, b=True, c=False, x='X', y=True)), + ('X A', NS(a='A', b=False, c=False, x='X', y=False, z=False)), + ('X -b', NS(a=None, b=True, c=False, x='X', y=False, z=False)), + ('X -c', NS(a=None, b=False, c=True, x='X', y=False, z=False)), + ('X A -y', NS(a='A', b=False, c=False, x='X', y=True, z=False)), + ('X -y -b', NS(a=None, b=True, c=False, x='X', y=True, z=False)), ] successes_when_not_required = [ - ('X', NS(a=None, b=False, c=False, x='X', y=False)), - ('X -y', NS(a=None, b=False, c=False, x='X', y=True)), + ('X', NS(a=None, b=False, c=False, x='X', y=False, z=False)), + ('X -y', NS(a=None, b=False, c=False, x='X', y=True, z=False)), ] - usage_when_required = usage_when_not_required = '''\ - usage: PROG [-h] [-y] [-b] [-c] x [a] + usage_when_not_required = '''\ + usage: PROG [-h] [-y] [-z] x [-b | -c | a] + ''' + usage_when_required = '''\ + 
usage: PROG [-h] [-y] [-z] x (-b | -c | a) ''' help = '''\ @@ -3772,6 +3790,7 @@ def get_parser(self, required): -y y help -b b help -c c help + -z z help ''' @@ -4979,9 +4998,9 @@ def test_mutex_groups_with_mixed_optionals_positionals_wrap(self): g.add_argument('positional', nargs='?') usage = textwrap.dedent('''\ - usage: PROG [-h] [-v | -q | -x [EXTRA_LONG_OPTION_NAME] | - -y [YET_ANOTHER_LONG_OPTION] | - positional] + usage: PROG [-h] + [-v | -q | -x [EXTRA_LONG_OPTION_NAME] | + -y [YET_ANOTHER_LONG_OPTION] | positional] ''') self.assertEqual(parser.format_usage(), usage) @@ -5675,6 +5694,11 @@ def custom_formatter(prog): a-very-long-command command that does something ''')) + def test_direct_formatter_instantiation(self): + formatter = argparse.HelpFormatter(prog="program") + formatter.add_usage(usage=None, actions=[], groups=[]) + help_text = formatter.format_help() + self.assertEqual(help_text, "usage: program\n") # ===================================== # Optional/Positional constructor tests @@ -7361,7 +7385,28 @@ def test_argparse_color(self): ), ) - def test_argparse_color_usage(self): + def test_argparse_color_mutually_exclusive_group_usage(self): + parser = argparse.ArgumentParser(color=True, prog="PROG") + group = parser.add_mutually_exclusive_group() + group.add_argument('--foo', action='store_true', help='FOO') + group.add_argument('--spam', help='SPAM') + group.add_argument('badger', nargs='*', help='BADGER') + + prog = self.theme.prog + heading = self.theme.heading + long = self.theme.summary_long_option + short = self.theme.summary_short_option + label = self.theme.summary_label + pos = self.theme.summary_action + reset = self.theme.reset + + self.assertEqual(parser.format_usage(), + f"{heading}usage: {reset}{prog}PROG{reset} [{short}-h{reset}] " + f"[{long}--foo{reset} | " + f"{long}--spam {label}SPAM{reset} | " + f"{pos}badger ...{reset}]\n") + + def test_argparse_color_custom_usage(self): # Arrange parser = argparse.ArgumentParser( add_help=False, diff --git a/Lib/test/test_ast/test_ast.py b/Lib/test/test_ast/test_ast.py index 12cae3c20fc..1eaf7f15bff 100644 --- a/Lib/test/test_ast/test_ast.py +++ b/Lib/test/test_ast/test_ast.py @@ -992,7 +992,8 @@ def next(self): @skip_wasi_stack_overflow() @skip_emscripten_stack_overflow() def test_ast_recursion_limit(self): - crash_depth = 500_000 + # Android test devices have less memory. 
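
The recursion and compilation stress tests below all apply the same idea: keep the original input size on desktop platforms but scale it down on Android, where test devices have less memory. A hedged sketch of the pattern (the ``scaled()`` helper is hypothetical; the actual changes inline the conditional expression):

    import sys

    def scaled(default, android):
        # Android test devices have less memory, so shrink stress inputs there.
        return android if sys.platform == "android" else default

    crash_depth = scaled(500_000, 100_000)       # test_ast recursion stress
    compile_depth = 100 * scaled(5_000, 1_000)   # test_compile recursion stress
    source_lines = scaled(200_000, 100_000)      # test_compile stack-overflow input
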
+ crash_depth = 100_000 if sys.platform == "android" else 500_000 success_depth = 200 if _testinternalcapi is not None: remaining = _testinternalcapi.get_c_recursion_remaining() diff --git a/Lib/test/test_asyncio/test_tasks.py b/Lib/test/test_asyncio/test_tasks.py index 931a43816a2..9809621a324 100644 --- a/Lib/test/test_asyncio/test_tasks.py +++ b/Lib/test/test_asyncio/test_tasks.py @@ -3680,6 +3680,30 @@ def task_factory(loop, coro): (loop, context), kwargs = callback.call_args self.assertEqual(context['exception'], exc_context.exception) + def test_run_coroutine_threadsafe_and_cancel(self): + task = None + thread_future = None + # Use a custom task factory to capture the created Task + def task_factory(loop, coro): + nonlocal task + task = asyncio.Task(coro, loop=loop) + return task + + self.addCleanup(self.loop.set_task_factory, + self.loop.get_task_factory()) + + async def target(): + nonlocal thread_future + self.loop.set_task_factory(task_factory) + thread_future = asyncio.run_coroutine_threadsafe(asyncio.sleep(10), self.loop) + await asyncio.sleep(0) + + thread_future.cancel() + + self.loop.run_until_complete(target()) + self.assertTrue(task.cancelled()) + self.assertTrue(thread_future.cancelled()) + class SleepTests(test_utils.TestCase): def setUp(self): diff --git a/Lib/test/test_compile.py b/Lib/test/test_compile.py index 30f21875b22..fa611f480d6 100644 --- a/Lib/test/test_compile.py +++ b/Lib/test/test_compile.py @@ -728,7 +728,8 @@ def test_yet_more_evil_still_undecodable(self): def test_compiler_recursion_limit(self): # Compiler frames are small limit = 100 - crash_depth = limit * 5000 + # Android test devices have less memory. + crash_depth = limit * (1000 if sys.platform == "android" else 5000) success_depth = limit def check_limit(prefix, repeated, mode="single"): @@ -1036,11 +1037,13 @@ def test_path_like_objects(self): # An implicit test for PyUnicode_FSDecoder(). compile("42", FakePath("test_compile_pathlike"), "single") + # bpo-31113: Stack overflow when compile a long sequence of + # complex statements. @support.requires_resource('cpu') def test_stack_overflow(self): - # bpo-31113: Stack overflow when compile a long sequence of - # complex statements. - compile("if a: b\n" * 200000, "", "exec") + # Android test devices have less memory. + size = 100_000 if sys.platform == "android" else 200_000 + compile("if a: b\n" * size, "", "exec") # Multiple users rely on the fact that CPython does not generate # bytecode for dead code blocks. See bpo-37500 for more context. diff --git a/Lib/test/test_doctest/test_doctest.py b/Lib/test/test_doctest/test_doctest.py index 0fa74407e3c..241d09db1fa 100644 --- a/Lib/test/test_doctest/test_doctest.py +++ b/Lib/test/test_doctest/test_doctest.py @@ -833,6 +833,118 @@ def test_empty_namespace_package(self): self.assertEqual(len(include_empty_finder.find(mod)), 1) self.assertEqual(len(exclude_empty_finder.find(mod)), 0) + def test_lineno_of_test_dict_strings(self): + """Test line numbers are found for __test__ dict strings.""" + module_content = '''\ +"""Module docstring.""" + +def dummy_function(): + """Dummy function docstring.""" + pass + +__test__ = { + 'test_string': """ + This is a test string. 
+ >>> 1 + 1 + 2 + """, +} +''' + with tempfile.TemporaryDirectory() as tmpdir: + module_path = os.path.join(tmpdir, 'test_module_lineno.py') + with open(module_path, 'w') as f: + f.write(module_content) + + sys.path.insert(0, tmpdir) + try: + import test_module_lineno + finder = doctest.DocTestFinder() + tests = finder.find(test_module_lineno) + + test_dict_test = None + for test in tests: + if '__test__' in test.name: + test_dict_test = test + break + + self.assertIsNotNone( + test_dict_test, + "__test__ dict test not found" + ) + # gh-69113: line number should not be None for __test__ strings + self.assertIsNotNone( + test_dict_test.lineno, + "Line number should not be None for __test__ dict strings" + ) + self.assertGreater( + test_dict_test.lineno, + 0, + "Line number should be positive" + ) + finally: + if 'test_module_lineno' in sys.modules: + del sys.modules['test_module_lineno'] + sys.path.pop(0) + + def test_lineno_multiline_matching(self): + """Test multi-line matching when no unique line exists.""" + # gh-69113: test that line numbers are found even when lines + # appear multiple times (e.g., ">>> x = 1" in both test entries) + module_content = '''\ +"""Module docstring.""" + +__test__ = { + 'test_one': """ + >>> x = 1 + >>> x + 1 + """, + 'test_two': """ + >>> x = 1 + >>> x + 2 + """, +} +''' + with tempfile.TemporaryDirectory() as tmpdir: + module_path = os.path.join(tmpdir, 'test_module_multiline.py') + with open(module_path, 'w') as f: + f.write(module_content) + + sys.path.insert(0, tmpdir) + try: + import test_module_multiline + finder = doctest.DocTestFinder() + tests = finder.find(test_module_multiline) + + test_one = None + test_two = None + for test in tests: + if 'test_one' in test.name: + test_one = test + elif 'test_two' in test.name: + test_two = test + + self.assertIsNotNone(test_one, "test_one not found") + self.assertIsNotNone(test_two, "test_two not found") + self.assertIsNotNone( + test_one.lineno, + "Line number should not be None for test_one" + ) + self.assertIsNotNone( + test_two.lineno, + "Line number should not be None for test_two" + ) + self.assertNotEqual( + test_one.lineno, + test_two.lineno, + "test_one and test_two should have different line numbers" + ) + finally: + if 'test_module_multiline' in sys.modules: + del sys.modules['test_module_multiline'] + sys.path.pop(0) + def test_DocTestParser(): r""" Unit tests for the `DocTestParser` class. 
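
A condensed, hedged illustration of what the new ``__test__`` line-number tests check (``demo_mod`` and the temporary directory are scaffolding for this sketch only): after the gh-69113 change, ``DocTestFinder`` is expected to report a real line number for strings registered in a module's ``__test__`` dict instead of ``None``.

    import doctest, importlib, os, sys, tempfile

    src = '__test__ = {"example": """\n>>> 1 + 1\n2\n"""}\n'
    with tempfile.TemporaryDirectory() as tmp:
        with open(os.path.join(tmp, "demo_mod.py"), "w") as f:
            f.write(src)
        sys.path.insert(0, tmp)
        try:
            mod = importlib.import_module("demo_mod")
            for test in doctest.DocTestFinder().find(mod):
                print(test.name, test.lineno)  # lineno should be positive, not None
        finally:
            sys.path.remove(tmp)
            sys.modules.pop("demo_mod", None)
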
@@ -2434,7 +2546,8 @@ def test_DocTestSuite_errors(): >>> print(result.failures[1][1]) # doctest: +ELLIPSIS Traceback (most recent call last): - File "...sample_doctest_errors.py", line None, in test.test_doctest.sample_doctest_errors.__test__.bad + File "...sample_doctest_errors.py", line 37, in test.test_doctest.sample_doctest_errors.__test__.bad + >...>> 2 + 2 AssertionError: Failed example: 2 + 2 Expected: @@ -2464,7 +2577,8 @@ def test_DocTestSuite_errors(): >>> print(result.errors[1][1]) # doctest: +ELLIPSIS Traceback (most recent call last): - File "...sample_doctest_errors.py", line None, in test.test_doctest.sample_doctest_errors.__test__.bad + File "...sample_doctest_errors.py", line 39, in test.test_doctest.sample_doctest_errors.__test__.bad + >...>> 1/0 File "", line 1, in 1/0 ~^~ @@ -3256,7 +3370,7 @@ def test_testmod_errors(): r""" ~^~ ZeroDivisionError: division by zero ********************************************************************** - File "...sample_doctest_errors.py", line ?, in test.test_doctest.sample_doctest_errors.__test__.bad + File "...sample_doctest_errors.py", line 37, in test.test_doctest.sample_doctest_errors.__test__.bad Failed example: 2 + 2 Expected: @@ -3264,7 +3378,7 @@ def test_testmod_errors(): r""" Got: 4 ********************************************************************** - File "...sample_doctest_errors.py", line ?, in test.test_doctest.sample_doctest_errors.__test__.bad + File "...sample_doctest_errors.py", line 39, in test.test_doctest.sample_doctest_errors.__test__.bad Failed example: 1/0 Exception raised: diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 179e236ecdf..f7f9f9c4e2f 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -3255,5 +3255,15 @@ def test_long_filename_attachment(self): " filename*1*=_TEST_TES.txt\n", ) + def test_fold_unfoldable_element_stealing_whitespace(self): + # gh-142006: When an element is too long to fit on the current line + # the previous line's trailing whitespace should not trigger a double newline. + policy = self.policy.clone(max_line_length=10) + # The non-whitespace text needs to exactly fill the max_line_length (10). 
+ text = ("a" * 9) + ", " + ("b" * 20) + expected = ("a" * 9) + ",\n " + ("b" * 20) + "\n" + token = parser.get_address_list(text)[0] + self._test(token, expected, policy=policy) + if __name__ == '__main__': unittest.main() diff --git a/Lib/test/test_email/test_defect_handling.py b/Lib/test/test_email/test_defect_handling.py index 44e76c8ce5e..acc4accccac 100644 --- a/Lib/test/test_email/test_defect_handling.py +++ b/Lib/test/test_email/test_defect_handling.py @@ -126,12 +126,10 @@ def test_multipart_invalid_cte(self): errors.InvalidMultipartContentTransferEncodingDefect) def test_multipart_no_cte_no_defect(self): - if self.raise_expected: return msg = self._str_msg(self.multipart_msg.format('')) self.assertEqual(len(self.get_defects(msg)), 0) def test_multipart_valid_cte_no_defect(self): - if self.raise_expected: return for cte in ('7bit', '8bit', 'BINary'): msg = self._str_msg( self.multipart_msg.format("\nContent-Transfer-Encoding: "+cte)) @@ -300,6 +298,47 @@ def test_missing_ending_boundary(self): self.assertDefectsEqual(self.get_defects(msg), [errors.CloseBoundaryNotFoundDefect]) + def test_line_beginning_colon(self): + string = ( + "Subject: Dummy subject\r\n: faulty header line\r\n\r\nbody\r\n" + ) + + with self._raise_point(errors.InvalidHeaderDefect): + msg = self._str_msg(string) + self.assertEqual(len(self.get_defects(msg)), 1) + self.assertDefectsEqual( + self.get_defects(msg), [errors.InvalidHeaderDefect] + ) + + if msg: + self.assertEqual(msg.items(), [("Subject", "Dummy subject")]) + self.assertEqual(msg.get_payload(), "body\r\n") + + def test_misplaced_envelope(self): + string = ( + "Subject: Dummy subject\r\nFrom wtf\r\nTo: abc\r\n\r\nbody\r\n" + ) + with self._raise_point(errors.MisplacedEnvelopeHeaderDefect): + msg = self._str_msg(string) + self.assertEqual(len(self.get_defects(msg)), 1) + self.assertDefectsEqual( + self.get_defects(msg), [errors.MisplacedEnvelopeHeaderDefect] + ) + + if msg: + headers = [("Subject", "Dummy subject"), ("To", "abc")] + self.assertEqual(msg.items(), headers) + self.assertEqual(msg.get_payload(), "body\r\n") + + + +class TestCompat32(TestDefectsBase, TestEmailBase): + + policy = policy.compat32 + + def get_defects(self, obj): + return obj.defects + class TestDefectDetection(TestDefectsBase, TestEmailBase): @@ -332,6 +371,9 @@ def _raise_point(self, defect): with self.assertRaises(defect): yield + def get_defects(self, obj): + return obj.defects + if __name__ == '__main__': unittest.main() diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index 4020f1041c4..4e6c213510c 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -2263,70 +2263,6 @@ def test_parse_missing_minor_type(self): eq(msg.get_content_maintype(), 'text') eq(msg.get_content_subtype(), 'plain') - # test_defect_handling - def test_same_boundary_inner_outer(self): - msg = self._msgobj('msg_15.txt') - # XXX We can probably eventually do better - inner = msg.get_payload(0) - self.assertHasAttr(inner, 'defects') - self.assertEqual(len(inner.defects), 1) - self.assertIsInstance(inner.defects[0], - errors.StartBoundaryNotFoundDefect) - - # test_defect_handling - def test_multipart_no_boundary(self): - msg = self._msgobj('msg_25.txt') - self.assertIsInstance(msg.get_payload(), str) - self.assertEqual(len(msg.defects), 2) - self.assertIsInstance(msg.defects[0], - errors.NoBoundaryInMultipartDefect) - self.assertIsInstance(msg.defects[1], - errors.MultipartInvariantViolationDefect) - - multipart_msg = 
textwrap.dedent("""\ - Date: Wed, 14 Nov 2007 12:56:23 GMT - From: foo@bar.invalid - To: foo@bar.invalid - Subject: Content-Transfer-Encoding: base64 and multipart - MIME-Version: 1.0 - Content-Type: multipart/mixed; - boundary="===============3344438784458119861=="{} - - --===============3344438784458119861== - Content-Type: text/plain - - Test message - - --===============3344438784458119861== - Content-Type: application/octet-stream - Content-Transfer-Encoding: base64 - - YWJj - - --===============3344438784458119861==-- - """) - - # test_defect_handling - def test_multipart_invalid_cte(self): - msg = self._str_msg( - self.multipart_msg.format("\nContent-Transfer-Encoding: base64")) - self.assertEqual(len(msg.defects), 1) - self.assertIsInstance(msg.defects[0], - errors.InvalidMultipartContentTransferEncodingDefect) - - # test_defect_handling - def test_multipart_no_cte_no_defect(self): - msg = self._str_msg(self.multipart_msg.format('')) - self.assertEqual(len(msg.defects), 0) - - # test_defect_handling - def test_multipart_valid_cte_no_defect(self): - for cte in ('7bit', '8bit', 'BINary'): - msg = self._str_msg( - self.multipart_msg.format( - "\nContent-Transfer-Encoding: {}".format(cte))) - self.assertEqual(len(msg.defects), 0) - # test_headerregistry.TestContentTypeHeader invalid_1 and invalid_2. def test_invalid_content_type(self): eq = self.assertEqual @@ -2403,30 +2339,6 @@ def test_missing_start_boundary(self): self.assertIsInstance(bad.defects[0], errors.StartBoundaryNotFoundDefect) - # test_defect_handling - def test_first_line_is_continuation_header(self): - eq = self.assertEqual - m = ' Line 1\nSubject: test\n\nbody' - msg = email.message_from_string(m) - eq(msg.keys(), ['Subject']) - eq(msg.get_payload(), 'body') - eq(len(msg.defects), 1) - self.assertDefectsEqual(msg.defects, - [errors.FirstHeaderLineIsContinuationDefect]) - eq(msg.defects[0].line, ' Line 1\n') - - # test_defect_handling - def test_missing_header_body_separator(self): - # Our heuristic if we see a line that doesn't look like a header (no - # leading whitespace but no ':') is to assume that the blank line that - # separates the header from the body is missing, and to stop parsing - # headers and start parsing the body. - msg = self._str_msg('Subject: test\nnot a header\nTo: abc\n\nb\n') - self.assertEqual(msg.keys(), ['Subject']) - self.assertEqual(msg.get_payload(), 'not a header\nTo: abc\n\nb\n') - self.assertDefectsEqual(msg.defects, - [errors.MissingHeaderBodySeparatorDefect]) - def test_string_payload_with_extra_space_after_cte(self): # https://github.com/python/cpython/issues/98188 cte = "base64 " diff --git a/Lib/test/test_exception_group.py b/Lib/test/test_exception_group.py index 5df2c41c6b5..ace7ec72917 100644 --- a/Lib/test/test_exception_group.py +++ b/Lib/test/test_exception_group.py @@ -1,4 +1,4 @@ -import collections.abc +import collections import types import unittest from test.support import skip_emscripten_stack_overflow, skip_wasi_stack_overflow, exceeds_recursion_limit @@ -193,6 +193,77 @@ class MyEG(ExceptionGroup): "MyEG('flat', [ValueError(1), TypeError(2)]), " "TypeError(2)])")) + def test_exceptions_mutation(self): + class MyEG(ExceptionGroup): + pass + + excs = [ValueError(1), TypeError(2)] + eg = MyEG('test', excs) + + self.assertEqual(repr(eg), "MyEG('test', [ValueError(1), TypeError(2)])") + excs.clear() + + # Ensure that clearing the exceptions sequence doesn't change the repr. 
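
The behaviour being pinned down here, shown in isolation (the output comments describe the expected result after this change): the exception group snapshots its exceptions at construction time, so mutating the original sequence afterwards changes neither the repr nor the stored exceptions, while ``args`` keeps referencing whatever was passed in.

    excs = [ValueError(1), TypeError(2)]
    eg = ExceptionGroup("demo", excs)
    excs.clear()
    print(repr(eg))  # ExceptionGroup('demo', [ValueError(1), TypeError(2)])
    print(eg.args)   # ('demo', []) - args still holds the (now empty) list
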
+ self.assertEqual(repr(eg), "MyEG('test', [ValueError(1), TypeError(2)])") + + # Ensure that the args are still as passed. + self.assertEqual(eg.args, ('test', [])) + + excs = (ValueError(1), KeyboardInterrupt(2)) + eg = BaseExceptionGroup('test', excs) + + # Ensure that immutable sequences still work fine. + self.assertEqual( + repr(eg), + "BaseExceptionGroup('test', (ValueError(1), KeyboardInterrupt(2)))" + ) + + # Test non-standard custom sequences. + excs = collections.deque([ValueError(1), TypeError(2)]) + eg = ExceptionGroup('test', excs) + + self.assertEqual( + repr(eg), + "ExceptionGroup('test', deque([ValueError(1), TypeError(2)]))" + ) + excs.clear() + + # Ensure that clearing the exceptions sequence doesn't change the repr. + self.assertEqual( + repr(eg), + "ExceptionGroup('test', deque([ValueError(1), TypeError(2)]))" + ) + + def test_repr_raises(self): + class MySeq(collections.abc.Sequence): + def __init__(self, raises): + self.raises = raises + + def __len__(self): + return 1 + + def __getitem__(self, index): + if index == 0: + return ValueError(1) + raise IndexError + + def __repr__(self): + if self.raises: + raise self.raises + return None + + seq = MySeq(None) + with self.assertRaisesRegex( + TypeError, + r".*MySeq\.__repr__\(\) must return a str, not NoneType" + ): + ExceptionGroup("test", seq) + + seq = MySeq(ValueError) + with self.assertRaises(ValueError): + BaseExceptionGroup("test", seq) + + def create_simple_eg(): excs = [] diff --git a/Lib/test/test_external_inspection.py b/Lib/test/test_external_inspection.py index 7decd8f32d5..a97242483a8 100644 --- a/Lib/test/test_external_inspection.py +++ b/Lib/test/test_external_inspection.py @@ -1,11 +1,13 @@ import unittest import os import textwrap +import contextlib import importlib import sys import socket import threading import time +from contextlib import contextmanager from asyncio import staggered, taskgroups, base_events, tasks from unittest.mock import ANY from test.support import ( @@ -26,9 +28,12 @@ PROFILING_MODE_ALL = 3 # Thread status flags -THREAD_STATUS_HAS_GIL = (1 << 0) -THREAD_STATUS_ON_CPU = (1 << 1) -THREAD_STATUS_UNKNOWN = (1 << 2) +THREAD_STATUS_HAS_GIL = 1 << 0 +THREAD_STATUS_ON_CPU = 1 << 1 +THREAD_STATUS_UNKNOWN = 1 << 2 + +# Maximum number of retry attempts for operations that may fail transiently +MAX_TRIES = 10 try: from concurrent import interpreters @@ -47,12 +52,149 @@ ) +# ============================================================================ +# Module-level helper functions +# ============================================================================ + + def _make_test_script(script_dir, script_basename, source): to_return = make_script(script_dir, script_basename, source) importlib.invalidate_caches() return to_return +def _create_server_socket(port, backlog=1): + """Create and configure a server socket for test communication.""" + server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + server_socket.bind(("localhost", port)) + server_socket.settimeout(SHORT_TIMEOUT) + server_socket.listen(backlog) + return server_socket + + +def _wait_for_signal(sock, expected_signals, timeout=SHORT_TIMEOUT): + """ + Wait for expected signal(s) from a socket with proper timeout and EOF handling. 
+ + Args: + sock: Connected socket to read from + expected_signals: Single bytes object or list of bytes objects to wait for + timeout: Socket timeout in seconds + + Returns: + bytes: Complete accumulated response buffer + + Raises: + RuntimeError: If connection closed before signal received or timeout + """ + if isinstance(expected_signals, bytes): + expected_signals = [expected_signals] + + sock.settimeout(timeout) + buffer = b"" + + while True: + # Check if all expected signals are in buffer + if all(sig in buffer for sig in expected_signals): + return buffer + + try: + chunk = sock.recv(4096) + if not chunk: + # EOF - connection closed + raise RuntimeError( + f"Connection closed before receiving expected signals. " + f"Expected: {expected_signals}, Got: {buffer[-200:]!r}" + ) + buffer += chunk + except socket.timeout: + raise RuntimeError( + f"Timeout waiting for signals. " + f"Expected: {expected_signals}, Got: {buffer[-200:]!r}" + ) + + +def _wait_for_n_signals(sock, signal_pattern, count, timeout=SHORT_TIMEOUT): + """ + Wait for N occurrences of a signal pattern. + + Args: + sock: Connected socket to read from + signal_pattern: bytes pattern to count (e.g., b"ready") + count: Number of occurrences expected + timeout: Socket timeout in seconds + + Returns: + bytes: Complete accumulated response buffer + + Raises: + RuntimeError: If connection closed or timeout before receiving all signals + """ + sock.settimeout(timeout) + buffer = b"" + found_count = 0 + + while found_count < count: + try: + chunk = sock.recv(4096) + if not chunk: + raise RuntimeError( + f"Connection closed after {found_count}/{count} signals. " + f"Last 200 bytes: {buffer[-200:]!r}" + ) + buffer += chunk + # Count occurrences in entire buffer + found_count = buffer.count(signal_pattern) + except socket.timeout: + raise RuntimeError( + f"Timeout waiting for {count} signals (found {found_count}). " + f"Last 200 bytes: {buffer[-200:]!r}" + ) + + return buffer + + +@contextmanager +def _managed_subprocess(args, timeout=SHORT_TIMEOUT): + """ + Context manager for subprocess lifecycle management. + + Ensures process is properly terminated and cleaned up even on exceptions. + Uses graceful termination first, then forceful kill if needed. 
+ """ + p = subprocess.Popen(args) + try: + yield p + finally: + try: + p.terminate() + try: + p.wait(timeout=timeout) + except subprocess.TimeoutExpired: + p.kill() + try: + p.wait(timeout=timeout) + except subprocess.TimeoutExpired: + pass # Process refuses to die, nothing more we can do + except OSError: + pass # Process already dead + + +def _cleanup_sockets(*sockets): + """Safely close multiple sockets, ignoring errors.""" + for sock in sockets: + if sock is not None: + try: + sock.close() + except OSError: + pass + + +# ============================================================================ +# Decorators and skip conditions +# ============================================================================ + skip_if_not_supported = unittest.skipIf( ( sys.platform != "darwin" @@ -65,40 +207,196 @@ def _make_test_script(script_dir, script_basename, source): def requires_subinterpreters(meth): """Decorator to skip a test if subinterpreters are not supported.""" - return unittest.skipIf(interpreters is None, - 'subinterpreters required')(meth) + return unittest.skipIf(interpreters is None, "subinterpreters required")( + meth + ) +# ============================================================================ +# Simple wrapper functions for RemoteUnwinder +# ============================================================================ + def get_stack_trace(pid): - unwinder = RemoteUnwinder(pid, all_threads=True, debug=True) - return unwinder.get_stack_trace() + for _ in busy_retry(SHORT_TIMEOUT): + try: + unwinder = RemoteUnwinder(pid, all_threads=True, debug=True) + return unwinder.get_stack_trace() + except RuntimeError as e: + continue + raise RuntimeError("Failed to get stack trace after retries") def get_async_stack_trace(pid): - unwinder = RemoteUnwinder(pid, debug=True) - return unwinder.get_async_stack_trace() + for _ in busy_retry(SHORT_TIMEOUT): + try: + unwinder = RemoteUnwinder(pid, debug=True) + return unwinder.get_async_stack_trace() + except RuntimeError as e: + continue + raise RuntimeError("Failed to get async stack trace after retries") def get_all_awaited_by(pid): - unwinder = RemoteUnwinder(pid, debug=True) - return unwinder.get_all_awaited_by() + for _ in busy_retry(SHORT_TIMEOUT): + try: + unwinder = RemoteUnwinder(pid, debug=True) + return unwinder.get_all_awaited_by() + except RuntimeError as e: + continue + raise RuntimeError("Failed to get all awaited_by after retries") -class TestGetStackTrace(unittest.TestCase): +# ============================================================================ +# Base test class with shared infrastructure +# ============================================================================ + + +class RemoteInspectionTestBase(unittest.TestCase): + """Base class for remote inspection tests with common helpers.""" + maxDiff = None + def _run_script_and_get_trace( + self, + script, + trace_func, + wait_for_signals=None, + port=None, + backlog=1, + ): + """ + Common pattern: run a script, wait for signals, get trace. 
+ + Args: + script: Script content (will be formatted with port if {port} present) + trace_func: Function to call with pid to get trace (e.g., get_stack_trace) + wait_for_signals: Signal(s) to wait for before getting trace + port: Port to use (auto-selected if None) + backlog: Socket listen backlog + + Returns: + tuple: (trace_result, script_name) + """ + if port is None: + port = find_unused_port() + + # Format script with port if needed + if "{port}" in script or "{{port}}" in script: + script = script.replace("{{port}}", "{port}").format(port=port) + + with os_helper.temp_dir() as work_dir: + script_dir = os.path.join(work_dir, "script_pkg") + os.mkdir(script_dir) + + server_socket = _create_server_socket(port, backlog) + script_name = _make_test_script(script_dir, "script", script) + client_socket = None + + try: + with _managed_subprocess([sys.executable, script_name]) as p: + client_socket, _ = server_socket.accept() + server_socket.close() + server_socket = None + + if wait_for_signals: + _wait_for_signal(client_socket, wait_for_signals) + + try: + trace = trace_func(p.pid) + except PermissionError: + self.skipTest( + "Insufficient permissions to read the stack trace" + ) + return trace, script_name + finally: + _cleanup_sockets(client_socket, server_socket) + + def _find_frame_in_trace(self, stack_trace, predicate): + """ + Find a frame matching predicate in stack trace. + + Args: + stack_trace: List of InterpreterInfo objects + predicate: Function(frame) -> bool + + Returns: + FrameInfo or None + """ + for interpreter_info in stack_trace: + for thread_info in interpreter_info.threads: + for frame in thread_info.frame_info: + if predicate(frame): + return frame + return None + + def _find_thread_by_id(self, stack_trace, thread_id): + """Find a thread by its native thread ID.""" + for interpreter_info in stack_trace: + for thread_info in interpreter_info.threads: + if thread_info.thread_id == thread_id: + return thread_info + return None + + def _find_thread_with_frame(self, stack_trace, frame_predicate): + """Find a thread containing a frame matching predicate.""" + for interpreter_info in stack_trace: + for thread_info in interpreter_info.threads: + for frame in thread_info.frame_info: + if frame_predicate(frame): + return thread_info + return None + + def _get_thread_statuses(self, stack_trace): + """Extract thread_id -> status mapping from stack trace.""" + statuses = {} + for interpreter_info in stack_trace: + for thread_info in interpreter_info.threads: + statuses[thread_info.thread_id] = thread_info.status + return statuses + + def _get_task_id_map(self, stack_trace): + """Create task_id -> task mapping from async stack trace.""" + return {task.task_id: task for task in stack_trace[0].awaited_by} + + def _get_awaited_by_relationships(self, stack_trace): + """Extract task name to awaited_by set mapping.""" + id_to_task = self._get_task_id_map(stack_trace) + return { + task.task_name: set( + id_to_task[awaited.task_name].task_name + for awaited in task.awaited_by + ) + for task in stack_trace[0].awaited_by + } + + def _extract_coroutine_stacks(self, stack_trace): + """Extract and format coroutine stacks from tasks.""" + return { + task.task_name: sorted( + tuple(tuple(frame) for frame in coro.call_stack) + for coro in task.coroutine_stack + ) + for task in stack_trace[0].awaited_by + } + + +# ============================================================================ +# Test classes +# ============================================================================ + + +class 
TestGetStackTrace(RemoteInspectionTestBase): @skip_if_not_supported @unittest.skipIf( sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, "Test only runs on Linux with process_vm_readv support", ) def test_remote_stack_trace(self): - # Spawn a process with some realistic Python code port = find_unused_port() script = textwrap.dedent( f"""\ import time, sys, socket, threading - # Connect to the test process + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect(('localhost', {port})) @@ -111,80 +409,78 @@ def baz(): foo() def foo(): - sock.sendall(b"ready:thread\\n"); time.sleep(10_000) # same line number + sock.sendall(b"ready:thread\\n"); time.sleep(10_000) t = threading.Thread(target=bar) t.start() - sock.sendall(b"ready:main\\n"); t.join() # same line number + sock.sendall(b"ready:main\\n"); t.join() """ ) - stack_trace = None + with os_helper.temp_dir() as work_dir: script_dir = os.path.join(work_dir, "script_pkg") os.mkdir(script_dir) - # Create a socket server to communicate with the target process - server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server_socket.bind(("localhost", port)) - server_socket.settimeout(SHORT_TIMEOUT) - server_socket.listen(1) - + server_socket = _create_server_socket(port) script_name = _make_test_script(script_dir, "script", script) client_socket = None + try: - p = subprocess.Popen([sys.executable, script_name]) - client_socket, _ = server_socket.accept() - server_socket.close() - response = b"" - while ( - b"ready:main" not in response - or b"ready:thread" not in response - ): - response += client_socket.recv(1024) - stack_trace = get_stack_trace(p.pid) - except PermissionError: - self.skipTest( - "Insufficient permissions to read the stack trace" - ) + with _managed_subprocess([sys.executable, script_name]) as p: + client_socket, _ = server_socket.accept() + server_socket.close() + server_socket = None + + _wait_for_signal( + client_socket, [b"ready:main", b"ready:thread"] + ) + + try: + stack_trace = get_stack_trace(p.pid) + except PermissionError: + self.skipTest( + "Insufficient permissions to read the stack trace" + ) + + thread_expected_stack_trace = [ + FrameInfo([script_name, 15, "foo"]), + FrameInfo([script_name, 12, "baz"]), + FrameInfo([script_name, 9, "bar"]), + FrameInfo([threading.__file__, ANY, "Thread.run"]), + FrameInfo( + [ + threading.__file__, + ANY, + "Thread._bootstrap_inner", + ] + ), + FrameInfo( + [threading.__file__, ANY, "Thread._bootstrap"] + ), + ] + + # Find expected thread stack + found_thread = self._find_thread_with_frame( + stack_trace, + lambda f: f.funcname == "foo" and f.lineno == 15, + ) + self.assertIsNotNone( + found_thread, "Expected thread stack trace not found" + ) + self.assertEqual( + found_thread.frame_info, thread_expected_stack_trace + ) + + # Check main thread + main_frame = FrameInfo([script_name, 19, ""]) + found_main = self._find_frame_in_trace( + stack_trace, lambda f: f == main_frame + ) + self.assertIsNotNone( + found_main, "Main thread stack trace not found" + ) finally: - if client_socket is not None: - client_socket.close() - p.kill() - p.terminate() - p.wait(timeout=SHORT_TIMEOUT) - - thread_expected_stack_trace = [ - FrameInfo([script_name, 15, "foo"]), - FrameInfo([script_name, 12, "baz"]), - FrameInfo([script_name, 9, "bar"]), - FrameInfo([threading.__file__, ANY, "Thread.run"]), - FrameInfo([threading.__file__, ANY, "Thread._bootstrap_inner"]), - FrameInfo([threading.__file__, 
ANY, "Thread._bootstrap"]), - ] - # Is possible that there are more threads, so we check that the - # expected stack traces are in the result (looking at you Windows!) - found_expected_stack = False - for interpreter_info in stack_trace: - for thread_info in interpreter_info.threads: - if thread_info.frame_info == thread_expected_stack_trace: - found_expected_stack = True - break - if found_expected_stack: - break - self.assertTrue(found_expected_stack, "Expected thread stack trace not found") - - # Check that the main thread stack trace is in the result - frame = FrameInfo([script_name, 19, ""]) - main_thread_found = False - for interpreter_info in stack_trace: - for thread_info in interpreter_info.threads: - if frame in thread_info.frame_info: - main_thread_found = True - break - if main_thread_found: - break - self.assertTrue(main_thread_found, "Main thread stack trace not found in result") + _cleanup_sockets(client_socket, server_socket) @skip_if_not_supported @unittest.skipIf( @@ -192,7 +488,6 @@ def foo(): "Test only runs on Linux with process_vm_readv support", ) def test_async_remote_stack_trace(self): - # Spawn a process with some realistic Python code port = find_unused_port() script = textwrap.dedent( f"""\ @@ -200,12 +495,12 @@ def test_async_remote_stack_trace(self): import time import sys import socket - # Connect to the test process + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect(('localhost', {port})) def c5(): - sock.sendall(b"ready"); time.sleep(10_000) # same line number + sock.sendall(b"ready"); time.sleep(10_000) async def c4(): await asyncio.sleep(0) @@ -236,7 +531,7 @@ def new_eager_loop(): asyncio.run(main(), loop_factory={{TASK_FACTORY}}) """ ) - stack_trace = None + for task_factory_variant in "asyncio.new_event_loop", "new_eager_loop": with ( self.subTest(task_factory_variant=task_factory_variant), @@ -244,195 +539,203 @@ def new_eager_loop(): ): script_dir = os.path.join(work_dir, "script_pkg") os.mkdir(script_dir) - server_socket = socket.socket( - socket.AF_INET, socket.SOCK_STREAM - ) - server_socket.setsockopt( - socket.SOL_SOCKET, socket.SO_REUSEADDR, 1 - ) - server_socket.bind(("localhost", port)) - server_socket.settimeout(SHORT_TIMEOUT) - server_socket.listen(1) + + server_socket = _create_server_socket(port) script_name = _make_test_script( script_dir, "script", script.format(TASK_FACTORY=task_factory_variant), ) client_socket = None + try: - p = subprocess.Popen([sys.executable, script_name]) - client_socket, _ = server_socket.accept() - server_socket.close() - response = client_socket.recv(1024) - self.assertEqual(response, b"ready") - stack_trace = get_async_stack_trace(p.pid) - except PermissionError: - self.skipTest( - "Insufficient permissions to read the stack trace" - ) - finally: - if client_socket is not None: - client_socket.close() - p.kill() - p.terminate() - p.wait(timeout=SHORT_TIMEOUT) + with _managed_subprocess( + [sys.executable, script_name] + ) as p: + client_socket, _ = server_socket.accept() + server_socket.close() + server_socket = None - # First check all the tasks are present - tasks_names = [ - task.task_name for task in stack_trace[0].awaited_by - ] - for task_name in ["c2_root", "sub_main_1", "sub_main_2"]: - self.assertIn(task_name, tasks_names) + response = _wait_for_signal(client_socket, b"ready") + self.assertIn(b"ready", response) - # Now ensure that the awaited_by_relationships are correct - id_to_task = { - task.task_id: task for task in stack_trace[0].awaited_by - } - task_name_to_awaited_by = 
{ - task.task_name: set( - id_to_task[awaited.task_name].task_name - for awaited in task.awaited_by - ) - for task in stack_trace[0].awaited_by - } - self.assertEqual( - task_name_to_awaited_by, - { - "c2_root": {"Task-1", "sub_main_1", "sub_main_2"}, - "Task-1": set(), - "sub_main_1": {"Task-1"}, - "sub_main_2": {"Task-1"}, - }, - ) - - # Now ensure that the coroutine stacks are correct - coroutine_stacks = { - task.task_name: sorted( - tuple(tuple(frame) for frame in coro.call_stack) - for coro in task.coroutine_stack - ) - for task in stack_trace[0].awaited_by - } - self.assertEqual( - coroutine_stacks, - { - "Task-1": [ - ( - tuple( - [ - taskgroups.__file__, - ANY, - "TaskGroup._aexit", - ] - ), - tuple( - [ - taskgroups.__file__, - ANY, - "TaskGroup.__aexit__", - ] - ), - tuple([script_name, 26, "main"]), + try: + stack_trace = get_async_stack_trace(p.pid) + except PermissionError: + self.skipTest( + "Insufficient permissions to read the stack trace" ) - ], - "c2_root": [ - ( - tuple([script_name, 10, "c5"]), - tuple([script_name, 14, "c4"]), - tuple([script_name, 17, "c3"]), - tuple([script_name, 20, "c2"]), - ) - ], - "sub_main_1": [(tuple([script_name, 23, "c1"]),)], - "sub_main_2": [(tuple([script_name, 23, "c1"]),)], - }, - ) - # Now ensure the coroutine stacks for the awaited_by relationships are correct. - awaited_by_coroutine_stacks = { - task.task_name: sorted( - ( - id_to_task[coro.task_name].task_name, - tuple(tuple(frame) for frame in coro.call_stack), + # Check all tasks are present + tasks_names = [ + task.task_name + for task in stack_trace[0].awaited_by + ] + for task_name in [ + "c2_root", + "sub_main_1", + "sub_main_2", + ]: + self.assertIn(task_name, tasks_names) + + # Check awaited_by relationships + relationships = self._get_awaited_by_relationships( + stack_trace ) - for coro in task.awaited_by - ) - for task in stack_trace[0].awaited_by - } - self.assertEqual( - awaited_by_coroutine_stacks, - { - "Task-1": [], - "c2_root": [ - ( - "Task-1", + self.assertEqual( + relationships, + { + "c2_root": { + "Task-1", + "sub_main_1", + "sub_main_2", + }, + "Task-1": set(), + "sub_main_1": {"Task-1"}, + "sub_main_2": {"Task-1"}, + }, + ) + + # Check coroutine stacks + coroutine_stacks = self._extract_coroutine_stacks( + stack_trace + ) + self.assertEqual( + coroutine_stacks, + { + "Task-1": [ + ( + tuple( + [ + taskgroups.__file__, + ANY, + "TaskGroup._aexit", + ] + ), + tuple( + [ + taskgroups.__file__, + ANY, + "TaskGroup.__aexit__", + ] + ), + tuple([script_name, 26, "main"]), + ) + ], + "c2_root": [ + ( + tuple([script_name, 10, "c5"]), + tuple([script_name, 14, "c4"]), + tuple([script_name, 17, "c3"]), + tuple([script_name, 20, "c2"]), + ) + ], + "sub_main_1": [ + (tuple([script_name, 23, "c1"]),) + ], + "sub_main_2": [ + (tuple([script_name, 23, "c1"]),) + ], + }, + ) + + # Check awaited_by coroutine stacks + id_to_task = self._get_task_id_map(stack_trace) + awaited_by_coroutine_stacks = { + task.task_name: sorted( ( + id_to_task[coro.task_name].task_name, tuple( - [ - taskgroups.__file__, - ANY, - "TaskGroup._aexit", - ] + tuple(frame) + for frame in coro.call_stack ), - tuple( - [ - taskgroups.__file__, - ANY, - "TaskGroup.__aexit__", - ] - ), - tuple([script_name, 26, "main"]), - ), - ), - ("sub_main_1", (tuple([script_name, 23, "c1"]),)), - ("sub_main_2", (tuple([script_name, 23, "c1"]),)), - ], - "sub_main_1": [ - ( - "Task-1", - ( - tuple( - [ - taskgroups.__file__, - ANY, - "TaskGroup._aexit", - ] - ), - tuple( - [ - taskgroups.__file__, - ANY, - 
"TaskGroup.__aexit__", - ] - ), - tuple([script_name, 26, "main"]), - ), + ) + for coro in task.awaited_by ) - ], - "sub_main_2": [ - ( - "Task-1", - ( - tuple( - [ - taskgroups.__file__, - ANY, - "TaskGroup._aexit", - ] + for task in stack_trace[0].awaited_by + } + self.assertEqual( + awaited_by_coroutine_stacks, + { + "Task-1": [], + "c2_root": [ + ( + "Task-1", + ( + tuple( + [ + taskgroups.__file__, + ANY, + "TaskGroup._aexit", + ] + ), + tuple( + [ + taskgroups.__file__, + ANY, + "TaskGroup.__aexit__", + ] + ), + tuple([script_name, 26, "main"]), + ), ), - tuple( - [ - taskgroups.__file__, - ANY, - "TaskGroup.__aexit__", - ] + ( + "sub_main_1", + (tuple([script_name, 23, "c1"]),), ), - tuple([script_name, 26, "main"]), - ), - ) - ], - }, - ) + ( + "sub_main_2", + (tuple([script_name, 23, "c1"]),), + ), + ], + "sub_main_1": [ + ( + "Task-1", + ( + tuple( + [ + taskgroups.__file__, + ANY, + "TaskGroup._aexit", + ] + ), + tuple( + [ + taskgroups.__file__, + ANY, + "TaskGroup.__aexit__", + ] + ), + tuple([script_name, 26, "main"]), + ), + ) + ], + "sub_main_2": [ + ( + "Task-1", + ( + tuple( + [ + taskgroups.__file__, + ANY, + "TaskGroup._aexit", + ] + ), + tuple( + [ + taskgroups.__file__, + ANY, + "TaskGroup.__aexit__", + ] + ), + tuple([script_name, 26, "main"]), + ), + ) + ], + }, + ) + finally: + _cleanup_sockets(client_socket, server_socket) @skip_if_not_supported @unittest.skipIf( @@ -440,7 +743,6 @@ def new_eager_loop(): "Test only runs on Linux with process_vm_readv support", ) def test_asyncgen_remote_stack_trace(self): - # Spawn a process with some realistic Python code port = find_unused_port() script = textwrap.dedent( f"""\ @@ -448,12 +750,12 @@ def test_asyncgen_remote_stack_trace(self): import time import sys import socket - # Connect to the test process + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect(('localhost', {port})) async def gen_nested_call(): - sock.sendall(b"ready"); time.sleep(10_000) # same line number + sock.sendall(b"ready"); time.sleep(10_000) async def gen(): for num in range(2): @@ -468,59 +770,56 @@ async def main(): asyncio.run(main()) """ ) - stack_trace = None + with os_helper.temp_dir() as work_dir: script_dir = os.path.join(work_dir, "script_pkg") os.mkdir(script_dir) - # Create a socket server to communicate with the target process - server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server_socket.bind(("localhost", port)) - server_socket.settimeout(SHORT_TIMEOUT) - server_socket.listen(1) + + server_socket = _create_server_socket(port) script_name = _make_test_script(script_dir, "script", script) client_socket = None + try: - p = subprocess.Popen([sys.executable, script_name]) - client_socket, _ = server_socket.accept() - server_socket.close() - response = client_socket.recv(1024) - self.assertEqual(response, b"ready") - stack_trace = get_async_stack_trace(p.pid) - except PermissionError: - self.skipTest( - "Insufficient permissions to read the stack trace" - ) - finally: - if client_socket is not None: - client_socket.close() - p.kill() - p.terminate() - p.wait(timeout=SHORT_TIMEOUT) + with _managed_subprocess([sys.executable, script_name]) as p: + client_socket, _ = server_socket.accept() + server_socket.close() + server_socket = None - # For this simple asyncgen test, we only expect one task with the full coroutine stack - self.assertEqual(len(stack_trace[0].awaited_by), 1) - task = stack_trace[0].awaited_by[0] - 
self.assertEqual(task.task_name, "Task-1") + response = _wait_for_signal(client_socket, b"ready") + self.assertIn(b"ready", response) - # Check the coroutine stack - based on actual output, only shows main - coroutine_stack = sorted( - tuple(tuple(frame) for frame in coro.call_stack) - for coro in task.coroutine_stack - ) - self.assertEqual( - coroutine_stack, - [ - ( - tuple([script_name, 10, "gen_nested_call"]), - tuple([script_name, 16, "gen"]), - tuple([script_name, 19, "main"]), + try: + stack_trace = get_async_stack_trace(p.pid) + except PermissionError: + self.skipTest( + "Insufficient permissions to read the stack trace" + ) + + # For this simple asyncgen test, we only expect one task + self.assertEqual(len(stack_trace[0].awaited_by), 1) + task = stack_trace[0].awaited_by[0] + self.assertEqual(task.task_name, "Task-1") + + # Check the coroutine stack + coroutine_stack = sorted( + tuple(tuple(frame) for frame in coro.call_stack) + for coro in task.coroutine_stack + ) + self.assertEqual( + coroutine_stack, + [ + ( + tuple([script_name, 10, "gen_nested_call"]), + tuple([script_name, 16, "gen"]), + tuple([script_name, 19, "main"]), + ) + ], ) - ], - ) - # No awaited_by relationships expected for this simple case - self.assertEqual(task.awaited_by, []) + # No awaited_by relationships expected + self.assertEqual(task.awaited_by, []) + finally: + _cleanup_sockets(client_socket, server_socket) @skip_if_not_supported @unittest.skipIf( @@ -528,7 +827,6 @@ async def main(): "Test only runs on Linux with process_vm_readv support", ) def test_async_gather_remote_stack_trace(self): - # Spawn a process with some realistic Python code port = find_unused_port() script = textwrap.dedent( f"""\ @@ -536,13 +834,13 @@ def test_async_gather_remote_stack_trace(self): import time import sys import socket - # Connect to the test process + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect(('localhost', {port})) async def deep(): await asyncio.sleep(0) - sock.sendall(b"ready"); time.sleep(10_000) # same line number + sock.sendall(b"ready"); time.sleep(10_000) async def c1(): await asyncio.sleep(0) @@ -557,103 +855,92 @@ async def main(): asyncio.run(main()) """ ) - stack_trace = None + with os_helper.temp_dir() as work_dir: script_dir = os.path.join(work_dir, "script_pkg") os.mkdir(script_dir) - # Create a socket server to communicate with the target process - server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server_socket.bind(("localhost", port)) - server_socket.settimeout(SHORT_TIMEOUT) - server_socket.listen(1) + + server_socket = _create_server_socket(port) script_name = _make_test_script(script_dir, "script", script) client_socket = None + try: - p = subprocess.Popen([sys.executable, script_name]) - client_socket, _ = server_socket.accept() - server_socket.close() - response = client_socket.recv(1024) - self.assertEqual(response, b"ready") - stack_trace = get_async_stack_trace(p.pid) - except PermissionError: - self.skipTest( - "Insufficient permissions to read the stack trace" - ) - finally: - if client_socket is not None: - client_socket.close() - p.kill() - p.terminate() - p.wait(timeout=SHORT_TIMEOUT) + with _managed_subprocess([sys.executable, script_name]) as p: + client_socket, _ = server_socket.accept() + server_socket.close() + server_socket = None - # First check all the tasks are present - tasks_names = [ - task.task_name for task in stack_trace[0].awaited_by - ] - for task_name in 
["Task-1", "Task-2"]: - self.assertIn(task_name, tasks_names) + response = _wait_for_signal(client_socket, b"ready") + self.assertIn(b"ready", response) - # Now ensure that the awaited_by_relationships are correct - id_to_task = { - task.task_id: task for task in stack_trace[0].awaited_by - } - task_name_to_awaited_by = { - task.task_name: set( - id_to_task[awaited.task_name].task_name - for awaited in task.awaited_by - ) - for task in stack_trace[0].awaited_by - } - self.assertEqual( - task_name_to_awaited_by, - { - "Task-1": set(), - "Task-2": {"Task-1"}, - }, - ) - - # Now ensure that the coroutine stacks are correct - coroutine_stacks = { - task.task_name: sorted( - tuple(tuple(frame) for frame in coro.call_stack) - for coro in task.coroutine_stack - ) - for task in stack_trace[0].awaited_by - } - self.assertEqual( - coroutine_stacks, - { - "Task-1": [(tuple([script_name, 21, "main"]),)], - "Task-2": [ - ( - tuple([script_name, 11, "deep"]), - tuple([script_name, 15, "c1"]), + try: + stack_trace = get_async_stack_trace(p.pid) + except PermissionError: + self.skipTest( + "Insufficient permissions to read the stack trace" ) - ], - }, - ) - # Now ensure the coroutine stacks for the awaited_by relationships are correct. - awaited_by_coroutine_stacks = { - task.task_name: sorted( - ( - id_to_task[coro.task_name].task_name, - tuple(tuple(frame) for frame in coro.call_stack), + # Check all tasks are present + tasks_names = [ + task.task_name for task in stack_trace[0].awaited_by + ] + for task_name in ["Task-1", "Task-2"]: + self.assertIn(task_name, tasks_names) + + # Check awaited_by relationships + relationships = self._get_awaited_by_relationships( + stack_trace ) - for coro in task.awaited_by - ) - for task in stack_trace[0].awaited_by - } - self.assertEqual( - awaited_by_coroutine_stacks, - { - "Task-1": [], - "Task-2": [ - ("Task-1", (tuple([script_name, 21, "main"]),)) - ], - }, - ) + self.assertEqual( + relationships, + { + "Task-1": set(), + "Task-2": {"Task-1"}, + }, + ) + + # Check coroutine stacks + coroutine_stacks = self._extract_coroutine_stacks( + stack_trace + ) + self.assertEqual( + coroutine_stacks, + { + "Task-1": [(tuple([script_name, 21, "main"]),)], + "Task-2": [ + ( + tuple([script_name, 11, "deep"]), + tuple([script_name, 15, "c1"]), + ) + ], + }, + ) + + # Check awaited_by coroutine stacks + id_to_task = self._get_task_id_map(stack_trace) + awaited_by_coroutine_stacks = { + task.task_name: sorted( + ( + id_to_task[coro.task_name].task_name, + tuple( + tuple(frame) for frame in coro.call_stack + ), + ) + for coro in task.awaited_by + ) + for task in stack_trace[0].awaited_by + } + self.assertEqual( + awaited_by_coroutine_stacks, + { + "Task-1": [], + "Task-2": [ + ("Task-1", (tuple([script_name, 21, "main"]),)) + ], + }, + ) + finally: + _cleanup_sockets(client_socket, server_socket) @skip_if_not_supported @unittest.skipIf( @@ -661,7 +948,6 @@ async def main(): "Test only runs on Linux with process_vm_readv support", ) def test_async_staggered_race_remote_stack_trace(self): - # Spawn a process with some realistic Python code port = find_unused_port() script = textwrap.dedent( f"""\ @@ -669,13 +955,13 @@ def test_async_staggered_race_remote_stack_trace(self): import time import sys import socket - # Connect to the test process + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect(('localhost', {port})) async def deep(): await asyncio.sleep(0) - sock.sendall(b"ready"); time.sleep(10_000) # same line number + sock.sendall(b"ready"); time.sleep(10_000) 
async def c1(): await asyncio.sleep(0) @@ -693,123 +979,122 @@ async def main(): asyncio.run(main()) """ ) - stack_trace = None + with os_helper.temp_dir() as work_dir: script_dir = os.path.join(work_dir, "script_pkg") os.mkdir(script_dir) - # Create a socket server to communicate with the target process - server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server_socket.bind(("localhost", port)) - server_socket.settimeout(SHORT_TIMEOUT) - server_socket.listen(1) + + server_socket = _create_server_socket(port) script_name = _make_test_script(script_dir, "script", script) client_socket = None + try: - p = subprocess.Popen([sys.executable, script_name]) - client_socket, _ = server_socket.accept() - server_socket.close() - response = client_socket.recv(1024) - self.assertEqual(response, b"ready") - stack_trace = get_async_stack_trace(p.pid) - except PermissionError: - self.skipTest( - "Insufficient permissions to read the stack trace" - ) - finally: - if client_socket is not None: - client_socket.close() - p.kill() - p.terminate() - p.wait(timeout=SHORT_TIMEOUT) + with _managed_subprocess([sys.executable, script_name]) as p: + client_socket, _ = server_socket.accept() + server_socket.close() + server_socket = None - # First check all the tasks are present - tasks_names = [ - task.task_name for task in stack_trace[0].awaited_by - ] - for task_name in ["Task-1", "Task-2"]: - self.assertIn(task_name, tasks_names) + response = _wait_for_signal(client_socket, b"ready") + self.assertIn(b"ready", response) - # Now ensure that the awaited_by_relationships are correct - id_to_task = { - task.task_id: task for task in stack_trace[0].awaited_by - } - task_name_to_awaited_by = { - task.task_name: set( - id_to_task[awaited.task_name].task_name - for awaited in task.awaited_by - ) - for task in stack_trace[0].awaited_by - } - self.assertEqual( - task_name_to_awaited_by, - { - "Task-1": set(), - "Task-2": {"Task-1"}, - }, - ) - - # Now ensure that the coroutine stacks are correct - coroutine_stacks = { - task.task_name: sorted( - tuple(tuple(frame) for frame in coro.call_stack) - for coro in task.coroutine_stack - ) - for task in stack_trace[0].awaited_by - } - self.assertEqual( - coroutine_stacks, - { - "Task-1": [ - ( - tuple([staggered.__file__, ANY, "staggered_race"]), - tuple([script_name, 21, "main"]), + try: + stack_trace = get_async_stack_trace(p.pid) + except PermissionError: + self.skipTest( + "Insufficient permissions to read the stack trace" ) - ], - "Task-2": [ - ( - tuple([script_name, 11, "deep"]), - tuple([script_name, 15, "c1"]), - tuple( - [ - staggered.__file__, - ANY, - "staggered_race..run_one_coro", - ] - ), - ) - ], - }, - ) - # Now ensure the coroutine stacks for the awaited_by relationships are correct. 
- awaited_by_coroutine_stacks = { - task.task_name: sorted( - ( - id_to_task[coro.task_name].task_name, - tuple(tuple(frame) for frame in coro.call_stack), + # Check all tasks are present + tasks_names = [ + task.task_name for task in stack_trace[0].awaited_by + ] + for task_name in ["Task-1", "Task-2"]: + self.assertIn(task_name, tasks_names) + + # Check awaited_by relationships + relationships = self._get_awaited_by_relationships( + stack_trace ) - for coro in task.awaited_by - ) - for task in stack_trace[0].awaited_by - } - self.assertEqual( - awaited_by_coroutine_stacks, - { - "Task-1": [], - "Task-2": [ - ( - "Task-1", + self.assertEqual( + relationships, + { + "Task-1": set(), + "Task-2": {"Task-1"}, + }, + ) + + # Check coroutine stacks + coroutine_stacks = self._extract_coroutine_stacks( + stack_trace + ) + self.assertEqual( + coroutine_stacks, + { + "Task-1": [ + ( + tuple( + [ + staggered.__file__, + ANY, + "staggered_race", + ] + ), + tuple([script_name, 21, "main"]), + ) + ], + "Task-2": [ + ( + tuple([script_name, 11, "deep"]), + tuple([script_name, 15, "c1"]), + tuple( + [ + staggered.__file__, + ANY, + "staggered_race..run_one_coro", + ] + ), + ) + ], + }, + ) + + # Check awaited_by coroutine stacks + id_to_task = self._get_task_id_map(stack_trace) + awaited_by_coroutine_stacks = { + task.task_name: sorted( ( + id_to_task[coro.task_name].task_name, tuple( - [staggered.__file__, ANY, "staggered_race"] + tuple(frame) for frame in coro.call_stack ), - tuple([script_name, 21, "main"]), - ), + ) + for coro in task.awaited_by ) - ], - }, - ) + for task in stack_trace[0].awaited_by + } + self.assertEqual( + awaited_by_coroutine_stacks, + { + "Task-1": [], + "Task-2": [ + ( + "Task-1", + ( + tuple( + [ + staggered.__file__, + ANY, + "staggered_race", + ] + ), + tuple([script_name, 21, "main"]), + ), + ) + ], + }, + ) + finally: + _cleanup_sockets(client_socket, server_socket) @skip_if_not_supported @unittest.skipIf( @@ -817,6 +1102,10 @@ async def main(): "Test only runs on Linux with process_vm_readv support", ) def test_async_global_awaited_by(self): + # Reduced from 1000 to 100 to avoid file descriptor exhaustion + # when running tests in parallel (e.g., -j 20) + NUM_TASKS = 100 + port = find_unused_port() script = textwrap.dedent( f"""\ @@ -832,7 +1121,6 @@ def test_async_global_awaited_by(self): PORT = socket_helper.find_unused_port() connections = 0 - # Connect to the test process sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect(('localhost', {port})) @@ -855,23 +1143,16 @@ async def echo_client(message): assert message == data.decode() writer.close() await writer.wait_closed() - # Signal we are ready to sleep sock.sendall(b"ready") await asyncio.sleep(SHORT_TIMEOUT) async def echo_client_spam(server): async with asyncio.TaskGroup() as tg: - while connections < 1000: + while connections < {NUM_TASKS}: msg = list(ascii_lowercase + digits) random.shuffle(msg) tg.create_task(echo_client("".join(msg))) await asyncio.sleep(0) - # at least a 1000 tasks created. 
Each task will signal - # when is ready to avoid the race caused by the fact that - # tasks are waited on tg.__exit__ and we cannot signal when - # that happens otherwise - # at this point all client tasks completed without assertion errors - # let's wrap up the test server.close() await server.wait_closed() @@ -886,231 +1167,216 @@ async def main(): asyncio.run(main()) """ ) - stack_trace = None + with os_helper.temp_dir() as work_dir: script_dir = os.path.join(work_dir, "script_pkg") os.mkdir(script_dir) - # Create a socket server to communicate with the target process - server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server_socket.bind(("localhost", port)) - server_socket.settimeout(SHORT_TIMEOUT) - server_socket.listen(1) + + server_socket = _create_server_socket(port) script_name = _make_test_script(script_dir, "script", script) client_socket = None + try: - p = subprocess.Popen([sys.executable, script_name]) - client_socket, _ = server_socket.accept() - server_socket.close() - for _ in range(1000): - expected_response = b"ready" - response = client_socket.recv(len(expected_response)) - self.assertEqual(response, expected_response) - for _ in busy_retry(SHORT_TIMEOUT): + with _managed_subprocess([sys.executable, script_name]) as p: + client_socket, _ = server_socket.accept() + server_socket.close() + server_socket = None + + # Wait for NUM_TASKS "ready" signals + try: + _wait_for_n_signals(client_socket, b"ready", NUM_TASKS) + except RuntimeError as e: + self.fail(str(e)) + try: all_awaited_by = get_all_awaited_by(p.pid) - except RuntimeError as re: - # This call reads a linked list in another process with - # no synchronization. That occasionally leads to invalid - # reads. Here we avoid making the test flaky. 
- msg = str(re) - if msg.startswith("Task list appears corrupted"): - continue - elif msg.startswith( - "Invalid linked list structure reading remote memory" - ): - continue - elif msg.startswith("Unknown error reading memory"): - continue - elif msg.startswith("Unhandled frame owner"): - continue - raise # Unrecognized exception, safest not to ignore it - else: - break - # expected: a list of two elements: 1 thread, 1 interp - self.assertEqual(len(all_awaited_by), 2) - # expected: a tuple with the thread ID and the awaited_by list - self.assertEqual(len(all_awaited_by[0]), 2) - # expected: no tasks in the fallback per-interp task list - self.assertEqual(all_awaited_by[1], (0, [])) - entries = all_awaited_by[0][1] - # expected: at least 1000 pending tasks - self.assertGreaterEqual(len(entries), 1000) - # the first three tasks stem from the code structure - main_stack = [ - FrameInfo([taskgroups.__file__, ANY, "TaskGroup._aexit"]), - FrameInfo( - [taskgroups.__file__, ANY, "TaskGroup.__aexit__"] - ), - FrameInfo([script_name, 60, "main"]), - ] - self.assertIn( - TaskInfo( - [ANY, "Task-1", [CoroInfo([main_stack, ANY])], []] - ), - entries, - ) - self.assertIn( - TaskInfo( - [ - ANY, - "server task", - [ - CoroInfo( - [ - [ - FrameInfo( - [ - base_events.__file__, - ANY, - "Server.serve_forever", - ] - ) - ], - ANY, - ] - ) - ], - [ - CoroInfo( - [ - [ - FrameInfo( - [ - taskgroups.__file__, - ANY, - "TaskGroup._aexit", - ] - ), - FrameInfo( - [ - taskgroups.__file__, - ANY, - "TaskGroup.__aexit__", - ] - ), - FrameInfo( - [script_name, ANY, "main"] - ), - ], - ANY, - ] - ) - ], - ] - ), - entries, - ) - self.assertIn( - TaskInfo( - [ - ANY, - "Task-4", - [ - CoroInfo( - [ - [ - FrameInfo( - [tasks.__file__, ANY, "sleep"] - ), - FrameInfo( - [ - script_name, - 38, - "echo_client", - ] - ), - ], - ANY, - ] - ) - ], - [ - CoroInfo( - [ - [ - FrameInfo( - [ - taskgroups.__file__, - ANY, - "TaskGroup._aexit", - ] - ), - FrameInfo( - [ - taskgroups.__file__, - ANY, - "TaskGroup.__aexit__", - ] - ), - FrameInfo( - [ - script_name, - 41, - "echo_client_spam", - ] - ), - ], - ANY, - ] - ) - ], - ] - ), - entries, - ) + except PermissionError: + self.skipTest( + "Insufficient permissions to read the stack trace" + ) - expected_awaited_by = [ - CoroInfo( - [ - [ - FrameInfo( - [ - taskgroups.__file__, - ANY, - "TaskGroup._aexit", - ] - ), - FrameInfo( - [ - taskgroups.__file__, - ANY, - "TaskGroup.__aexit__", - ] - ), - FrameInfo( - [script_name, 41, "echo_client_spam"] - ), - ], - ANY, - ] - ) - ] - tasks_with_awaited = [ - task - for task in entries - if task.awaited_by == expected_awaited_by - ] - self.assertGreaterEqual(len(tasks_with_awaited), 1000) + # Expected: a list of two elements: 1 thread, 1 interp + self.assertEqual(len(all_awaited_by), 2) + # Expected: a tuple with the thread ID and the awaited_by list + self.assertEqual(len(all_awaited_by[0]), 2) + # Expected: no tasks in the fallback per-interp task list + self.assertEqual(all_awaited_by[1], (0, [])) - # the final task will have some random number, but it should for - # sure be one of the echo client spam horde (In windows this is not true - # for some reason) - if sys.platform != "win32": - self.assertEqual( - tasks_with_awaited[-1].awaited_by, - entries[-1].awaited_by, + entries = all_awaited_by[0][1] + # Expected: at least NUM_TASKS pending tasks + self.assertGreaterEqual(len(entries), NUM_TASKS) + + # Check the main task structure + main_stack = [ + FrameInfo( + [taskgroups.__file__, ANY, "TaskGroup._aexit"] + ), + FrameInfo( + 
[taskgroups.__file__, ANY, "TaskGroup.__aexit__"] + ), + FrameInfo([script_name, 52, "main"]), + ] + self.assertIn( + TaskInfo( + [ANY, "Task-1", [CoroInfo([main_stack, ANY])], []] + ), + entries, ) - except PermissionError: - self.skipTest( - "Insufficient permissions to read the stack trace" - ) + self.assertIn( + TaskInfo( + [ + ANY, + "server task", + [ + CoroInfo( + [ + [ + FrameInfo( + [ + base_events.__file__, + ANY, + "Server.serve_forever", + ] + ) + ], + ANY, + ] + ) + ], + [ + CoroInfo( + [ + [ + FrameInfo( + [ + taskgroups.__file__, + ANY, + "TaskGroup._aexit", + ] + ), + FrameInfo( + [ + taskgroups.__file__, + ANY, + "TaskGroup.__aexit__", + ] + ), + FrameInfo( + [script_name, ANY, "main"] + ), + ], + ANY, + ] + ) + ], + ] + ), + entries, + ) + self.assertIn( + TaskInfo( + [ + ANY, + "Task-4", + [ + CoroInfo( + [ + [ + FrameInfo( + [ + tasks.__file__, + ANY, + "sleep", + ] + ), + FrameInfo( + [ + script_name, + 36, + "echo_client", + ] + ), + ], + ANY, + ] + ) + ], + [ + CoroInfo( + [ + [ + FrameInfo( + [ + taskgroups.__file__, + ANY, + "TaskGroup._aexit", + ] + ), + FrameInfo( + [ + taskgroups.__file__, + ANY, + "TaskGroup.__aexit__", + ] + ), + FrameInfo( + [ + script_name, + 39, + "echo_client_spam", + ] + ), + ], + ANY, + ] + ) + ], + ] + ), + entries, + ) + + expected_awaited_by = [ + CoroInfo( + [ + [ + FrameInfo( + [ + taskgroups.__file__, + ANY, + "TaskGroup._aexit", + ] + ), + FrameInfo( + [ + taskgroups.__file__, + ANY, + "TaskGroup.__aexit__", + ] + ), + FrameInfo( + [script_name, 39, "echo_client_spam"] + ), + ], + ANY, + ] + ) + ] + tasks_with_awaited = [ + task + for task in entries + if task.awaited_by == expected_awaited_by + ] + self.assertGreaterEqual(len(tasks_with_awaited), NUM_TASKS) + + # Final task should be from echo client spam (not on Windows) + if sys.platform != "win32": + self.assertEqual( + tasks_with_awaited[-1].awaited_by, + entries[-1].awaited_by, + ) finally: - if client_socket is not None: - client_socket.close() - p.kill() - p.terminate() - p.wait(timeout=SHORT_TIMEOUT) + _cleanup_sockets(client_socket, server_socket) @skip_if_not_supported @unittest.skipIf( @@ -1119,25 +1385,24 @@ async def main(): ) def test_self_trace(self): stack_trace = get_stack_trace(os.getpid()) - # Is possible that there are more threads, so we check that the - # expected stack traces are in the result (looking at you Windows!) 
- this_tread_stack = None - # New format: [InterpreterInfo(interpreter_id, [ThreadInfo(...)])] + + this_thread_stack = None for interpreter_info in stack_trace: for thread_info in interpreter_info.threads: if thread_info.thread_id == threading.get_native_id(): - this_tread_stack = thread_info.frame_info + this_thread_stack = thread_info.frame_info break - if this_tread_stack: + if this_thread_stack: break - self.assertIsNotNone(this_tread_stack) + + self.assertIsNotNone(this_thread_stack) self.assertEqual( - this_tread_stack[:2], + this_thread_stack[:2], [ FrameInfo( [ __file__, - get_stack_trace.__code__.co_firstlineno + 2, + get_stack_trace.__code__.co_firstlineno + 4, "get_stack_trace", ] ), @@ -1158,12 +1423,11 @@ def test_self_trace(self): ) @requires_subinterpreters def test_subinterpreter_stack_trace(self): - # Test that subinterpreters are correctly handled port = find_unused_port() - # Calculate subinterpreter code separately and pickle it to avoid f-string issues import pickle - subinterp_code = textwrap.dedent(f''' + + subinterp_code = textwrap.dedent(f""" import socket import time @@ -1176,9 +1440,8 @@ def nested_func(): nested_func() sub_worker() - ''').strip() + """).strip() - # Pickle the subinterpreter code pickled_code = pickle.dumps(subinterp_code) script = textwrap.dedent( @@ -1189,33 +1452,26 @@ def nested_func(): import socket import threading - # Connect to the test process sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect(('localhost', {port})) def main_worker(): - # Function running in main interpreter sock.sendall(b"ready:main\\n") time.sleep(10_000) def run_subinterp(): - # Create and run subinterpreter subinterp = interpreters.create() - import pickle pickled_code = {pickled_code!r} subinterp_code = pickle.loads(pickled_code) subinterp.exec(subinterp_code) - # Start subinterpreter in thread sub_thread = threading.Thread(target=run_subinterp) sub_thread.start() - # Start main thread work main_thread = threading.Thread(target=main_worker) main_thread.start() - # Keep main thread alive main_thread.join() sub_thread.join() """ @@ -1225,85 +1481,74 @@ def run_subinterp(): script_dir = os.path.join(work_dir, "script_pkg") os.mkdir(script_dir) - # Create a socket server to communicate with the target process - server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server_socket.bind(("localhost", port)) - server_socket.settimeout(SHORT_TIMEOUT) - server_socket.listen(1) - + server_socket = _create_server_socket(port) script_name = _make_test_script(script_dir, "script", script) client_sockets = [] + try: - p = subprocess.Popen([sys.executable, script_name]) - - # Accept connections from both main and subinterpreter - responses = set() - while len(responses) < 2: # Wait for both "ready:main" and "ready:sub" - try: - client_socket, _ = server_socket.accept() - client_sockets.append(client_socket) - - # Read the response from this connection - response = client_socket.recv(1024) - if b"ready:main" in response: - responses.add("main") - if b"ready:sub" in response: - responses.add("sub") - except socket.timeout: - break - - server_socket.close() - stack_trace = get_stack_trace(p.pid) - except PermissionError: - self.skipTest( - "Insufficient permissions to read the stack trace" - ) - finally: - for client_socket in client_sockets: - if client_socket is not None: - client_socket.close() - p.kill() - p.terminate() - p.wait(timeout=SHORT_TIMEOUT) - - # Verify we have multiple 
interpreters - self.assertGreaterEqual(len(stack_trace), 1, "Should have at least one interpreter") - - # Look for main interpreter (ID 0) and subinterpreter (ID > 0) - main_interp = None - sub_interp = None - - for interpreter_info in stack_trace: - if interpreter_info.interpreter_id == 0: - main_interp = interpreter_info - elif interpreter_info.interpreter_id > 0: - sub_interp = interpreter_info - - self.assertIsNotNone(main_interp, "Main interpreter should be present") - - # Check main interpreter has expected stack trace - main_found = False - for thread_info in main_interp.threads: - for frame in thread_info.frame_info: - if frame.funcname == "main_worker": - main_found = True - break - if main_found: - break - self.assertTrue(main_found, "Main interpreter should have main_worker in stack") - - # If subinterpreter is present, check its stack trace - if sub_interp: - sub_found = False - for thread_info in sub_interp.threads: - for frame in thread_info.frame_info: - if frame.funcname in ("sub_worker", "nested_func"): - sub_found = True + with _managed_subprocess([sys.executable, script_name]) as p: + # Accept connections from both main and subinterpreter + responses = set() + while len(responses) < 2: + try: + client_socket, _ = server_socket.accept() + client_sockets.append(client_socket) + response = client_socket.recv(1024) + if b"ready:main" in response: + responses.add("main") + if b"ready:sub" in response: + responses.add("sub") + except socket.timeout: break - if sub_found: - break - self.assertTrue(sub_found, "Subinterpreter should have sub_worker or nested_func in stack") + + server_socket.close() + server_socket = None + + try: + stack_trace = get_stack_trace(p.pid) + except PermissionError: + self.skipTest( + "Insufficient permissions to read the stack trace" + ) + + # Verify we have at least one interpreter + self.assertGreaterEqual(len(stack_trace), 1) + + # Look for main interpreter (ID 0) and subinterpreter (ID > 0) + main_interp = None + sub_interp = None + for interpreter_info in stack_trace: + if interpreter_info.interpreter_id == 0: + main_interp = interpreter_info + elif interpreter_info.interpreter_id > 0: + sub_interp = interpreter_info + + self.assertIsNotNone( + main_interp, "Main interpreter should be present" + ) + + # Check main interpreter has expected stack trace + main_found = self._find_frame_in_trace( + [main_interp], lambda f: f.funcname == "main_worker" + ) + self.assertIsNotNone( + main_found, + "Main interpreter should have main_worker in stack", + ) + + # If subinterpreter is present, check its stack trace + if sub_interp: + sub_found = self._find_frame_in_trace( + [sub_interp], + lambda f: f.funcname + in ("sub_worker", "nested_func"), + ) + self.assertIsNotNone( + sub_found, + "Subinterpreter should have sub_worker or nested_func in stack", + ) + finally: + _cleanup_sockets(*client_sockets, server_socket) @skip_if_not_supported @unittest.skipIf( @@ -1312,14 +1557,11 @@ def run_subinterp(): ) @requires_subinterpreters def test_multiple_subinterpreters_with_threads(self): - # Test multiple subinterpreters, each with multiple threads port = find_unused_port() - # Calculate subinterpreter codes separately and pickle them import pickle - # Code for first subinterpreter with 2 threads - subinterp1_code = textwrap.dedent(f''' + subinterp1_code = textwrap.dedent(f""" import socket import time import threading @@ -1346,10 +1588,9 @@ def nested_func(): t2.start() t1.join() t2.join() - ''').strip() + """).strip() - # Code for second subinterpreter with 2 
threads - subinterp2_code = textwrap.dedent(f''' + subinterp2_code = textwrap.dedent(f""" import socket import time import threading @@ -1376,9 +1617,8 @@ def nested_func(): t2.start() t1.join() t2.join() - ''').strip() + """).strip() - # Pickle the subinterpreter codes pickled_code1 = pickle.dumps(subinterp1_code) pickled_code2 = pickle.dumps(subinterp2_code) @@ -1390,44 +1630,35 @@ def nested_func(): import socket import threading - # Connect to the test process sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect(('localhost', {port})) def main_worker(): - # Function running in main interpreter sock.sendall(b"ready:main\\n") time.sleep(10_000) def run_subinterp1(): - # Create and run first subinterpreter subinterp = interpreters.create() - import pickle pickled_code = {pickled_code1!r} subinterp_code = pickle.loads(pickled_code) subinterp.exec(subinterp_code) def run_subinterp2(): - # Create and run second subinterpreter subinterp = interpreters.create() - import pickle pickled_code = {pickled_code2!r} subinterp_code = pickle.loads(pickled_code) subinterp.exec(subinterp_code) - # Start subinterpreters in threads sub1_thread = threading.Thread(target=run_subinterp1) sub2_thread = threading.Thread(target=run_subinterp2) sub1_thread.start() sub2_thread.start() - # Start main thread work main_thread = threading.Thread(target=main_worker) main_thread.start() - # Keep main thread alive main_thread.join() sub1_thread.join() sub2_thread.join() @@ -1438,72 +1669,80 @@ def run_subinterp2(): script_dir = os.path.join(work_dir, "script_pkg") os.mkdir(script_dir) - # Create a socket server to communicate with the target process - server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server_socket.bind(("localhost", port)) - server_socket.settimeout(SHORT_TIMEOUT) - server_socket.listen(5) # Allow multiple connections - + server_socket = _create_server_socket(port, backlog=5) script_name = _make_test_script(script_dir, "script", script) client_sockets = [] + try: - p = subprocess.Popen([sys.executable, script_name]) + with _managed_subprocess([sys.executable, script_name]) as p: + # Accept connections from main and all subinterpreter threads + expected_responses = { + "ready:main", + "ready:sub1-t1", + "ready:sub1-t2", + "ready:sub2-t1", + "ready:sub2-t2", + } + responses = set() - # Accept connections from main and all subinterpreter threads - expected_responses = {"ready:main", "ready:sub1-t1", "ready:sub1-t2", "ready:sub2-t1", "ready:sub2-t2"} - responses = set() + while len(responses) < 5: + try: + client_socket, _ = server_socket.accept() + client_sockets.append(client_socket) + response = client_socket.recv(1024) + response_str = response.decode().strip() + if response_str in expected_responses: + responses.add(response_str) + except socket.timeout: + break + + server_socket.close() + server_socket = None - while len(responses) < 5: # Wait for all 5 ready signals try: - client_socket, _ = server_socket.accept() - client_sockets.append(client_socket) + stack_trace = get_stack_trace(p.pid) + except PermissionError: + self.skipTest( + "Insufficient permissions to read the stack trace" + ) - # Read the response from this connection - response = client_socket.recv(1024) - response_str = response.decode().strip() - if response_str in expected_responses: - responses.add(response_str) - except socket.timeout: - break + # Verify we have multiple interpreters + self.assertGreaterEqual(len(stack_trace), 2) 
- server_socket.close() - stack_trace = get_stack_trace(p.pid) - except PermissionError: - self.skipTest( - "Insufficient permissions to read the stack trace" - ) + # Count interpreters by ID + interpreter_ids = { + interp.interpreter_id for interp in stack_trace + } + self.assertIn( + 0, + interpreter_ids, + "Main interpreter should be present", + ) + self.assertGreaterEqual(len(interpreter_ids), 3) + + # Count total threads + total_threads = sum( + len(interp.threads) for interp in stack_trace + ) + self.assertGreaterEqual(total_threads, 5) + + # Look for expected function names + all_funcnames = set() + for interpreter_info in stack_trace: + for thread_info in interpreter_info.threads: + for frame in thread_info.frame_info: + all_funcnames.add(frame.funcname) + + expected_funcs = { + "main_worker", + "worker1", + "worker2", + "nested_func", + } + found_funcs = expected_funcs.intersection(all_funcnames) + self.assertGreater(len(found_funcs), 0) finally: - for client_socket in client_sockets: - if client_socket is not None: - client_socket.close() - p.kill() - p.terminate() - p.wait(timeout=SHORT_TIMEOUT) - - # Verify we have multiple interpreters - self.assertGreaterEqual(len(stack_trace), 2, "Should have at least two interpreters") - - # Count interpreters by ID - interpreter_ids = {interp.interpreter_id for interp in stack_trace} - self.assertIn(0, interpreter_ids, "Main interpreter should be present") - self.assertGreaterEqual(len(interpreter_ids), 3, "Should have main + at least 2 subinterpreters") - - # Count total threads across all interpreters - total_threads = sum(len(interp.threads) for interp in stack_trace) - self.assertGreaterEqual(total_threads, 5, "Should have at least 5 threads total") - - # Look for expected function names in stack traces - all_funcnames = set() - for interpreter_info in stack_trace: - for thread_info in interpreter_info.threads: - for frame in thread_info.frame_info: - all_funcnames.add(frame.funcname) - - # Should find functions from different interpreters and threads - expected_funcs = {"main_worker", "worker1", "worker2", "nested_func"} - found_funcs = expected_funcs.intersection(all_funcnames) - self.assertGreater(len(found_funcs), 0, f"Should find some expected functions, got: {all_funcnames}") + _cleanup_sockets(*client_sockets, server_socket) @skip_if_not_supported @unittest.skipIf( @@ -1512,54 +1751,41 @@ def run_subinterp2(): ) @requires_gil_enabled("Free threaded builds don't have an 'active thread'") def test_only_active_thread(self): - # Test that only_active_thread parameter works correctly port = find_unused_port() script = textwrap.dedent( f"""\ import time, sys, socket, threading - # Connect to the test process sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect(('localhost', {port})) def worker_thread(name, barrier, ready_event): - barrier.wait() # Synchronize thread start - ready_event.wait() # Wait for main thread signal - # Sleep to keep thread alive + barrier.wait() + ready_event.wait() time.sleep(10_000) def main_work(): - # Do busy work to hold the GIL sock.sendall(b"working\\n") count = 0 while count < 100000000: count += 1 if count % 10000000 == 0: - pass # Keep main thread busy + pass sock.sendall(b"done\\n") - # Create synchronization primitives num_threads = 3 - barrier = threading.Barrier(num_threads + 1) # +1 for main thread + barrier = threading.Barrier(num_threads + 1) ready_event = threading.Event() - # Start worker threads threads = [] for i in range(num_threads): t = 
threading.Thread(target=worker_thread, args=(f"Worker-{{i}}", barrier, ready_event)) t.start() threads.append(t) - # Wait for all threads to be ready barrier.wait() - - # Signal ready to parent process sock.sendall(b"ready\\n") - - # Signal threads to start waiting ready_event.set() - - # Now do busy work to hold the GIL main_work() """ ) @@ -1568,104 +1794,76 @@ def main_work(): script_dir = os.path.join(work_dir, "script_pkg") os.mkdir(script_dir) - # Create a socket server to communicate with the target process - server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server_socket.bind(("localhost", port)) - server_socket.settimeout(SHORT_TIMEOUT) - server_socket.listen(1) - + server_socket = _create_server_socket(port) script_name = _make_test_script(script_dir, "script", script) client_socket = None + try: - p = subprocess.Popen([sys.executable, script_name]) - client_socket, _ = server_socket.accept() - server_socket.close() + with _managed_subprocess([sys.executable, script_name]) as p: + client_socket, _ = server_socket.accept() + server_socket.close() + server_socket = None - # Wait for ready signal - response = b"" - while b"ready" not in response: - response += client_socket.recv(1024) + # Wait for ready and working signals + _wait_for_signal(client_socket, [b"ready", b"working"]) - # Wait for the main thread to start its busy work - while b"working" not in response: - response += client_socket.recv(1024) - - # Get stack trace with all threads - unwinder_all = RemoteUnwinder(p.pid, all_threads=True) - for _ in range(10): - # Wait for the main thread to start its busy work - all_traces = unwinder_all.get_stack_trace() - found = False - # New format: [InterpreterInfo(interpreter_id, [ThreadInfo(...)])] - for interpreter_info in all_traces: - for thread_info in interpreter_info.threads: - if not thread_info.frame_info: - continue - current_frame = thread_info.frame_info[0] - if ( - current_frame.funcname == "main_work" - and current_frame.lineno > 15 - ): - found = True + try: + # Get stack trace with all threads + unwinder_all = RemoteUnwinder(p.pid, all_threads=True) + for _ in range(MAX_TRIES): + all_traces = unwinder_all.get_stack_trace() + found = self._find_frame_in_trace( + all_traces, + lambda f: f.funcname == "main_work" + and f.lineno > 12, + ) + if found: break - if found: + time.sleep(0.1) + else: + self.fail( + "Main thread did not start its busy work on time" + ) + + # Get stack trace with only GIL holder + unwinder_gil = RemoteUnwinder( + p.pid, only_active_thread=True + ) + gil_traces = unwinder_gil.get_stack_trace() + except PermissionError: + self.skipTest( + "Insufficient permissions to read the stack trace" + ) + + # Count threads + total_threads = sum( + len(interp.threads) for interp in all_traces + ) + self.assertGreater(total_threads, 1) + + total_gil_threads = sum( + len(interp.threads) for interp in gil_traces + ) + self.assertEqual(total_gil_threads, 1) + + # Get the GIL holder thread ID + gil_thread_id = None + for interpreter_info in gil_traces: + if interpreter_info.threads: + gil_thread_id = interpreter_info.threads[ + 0 + ].thread_id break - if found: - break - # Give a bit of time to take the next sample - time.sleep(0.1) - else: - self.fail( - "Main thread did not start its busy work on time" - ) + # Get all thread IDs + all_thread_ids = [] + for interpreter_info in all_traces: + for thread_info in interpreter_info.threads: + 
all_thread_ids.append(thread_info.thread_id) - # Get stack trace with only GIL holder - unwinder_gil = RemoteUnwinder(p.pid, only_active_thread=True) - gil_traces = unwinder_gil.get_stack_trace() - - except PermissionError: - self.skipTest( - "Insufficient permissions to read the stack trace" - ) + self.assertIn(gil_thread_id, all_thread_ids) finally: - if client_socket is not None: - client_socket.close() - p.kill() - p.terminate() - p.wait(timeout=SHORT_TIMEOUT) - - # Count total threads across all interpreters in all_traces - total_threads = sum(len(interpreter_info.threads) for interpreter_info in all_traces) - self.assertGreater( - total_threads, 1, "Should have multiple threads" - ) - - # Count total threads across all interpreters in gil_traces - total_gil_threads = sum(len(interpreter_info.threads) for interpreter_info in gil_traces) - self.assertEqual( - total_gil_threads, 1, "Should have exactly one GIL holder" - ) - - # Get the GIL holder thread ID - gil_thread_id = None - for interpreter_info in gil_traces: - if interpreter_info.threads: - gil_thread_id = interpreter_info.threads[0].thread_id - break - - # Get all thread IDs from all_traces - all_thread_ids = [] - for interpreter_info in all_traces: - for thread_info in interpreter_info.threads: - all_thread_ids.append(thread_info.thread_id) - - self.assertIn( - gil_thread_id, - all_thread_ids, - "GIL holder should be among all threads", - ) + _cleanup_sockets(client_socket, server_socket) class TestUnsupportedPlatformHandling(unittest.TestCase): @@ -1673,23 +1871,28 @@ class TestUnsupportedPlatformHandling(unittest.TestCase): sys.platform in ("linux", "darwin", "win32"), "Test only runs on unsupported platforms (not Linux, macOS, or Windows)", ) - @unittest.skipIf(sys.platform == "android", "Android raises Linux-specific exception") + @unittest.skipIf( + sys.platform == "android", "Android raises Linux-specific exception" + ) def test_unsupported_platform_error(self): with self.assertRaises(RuntimeError) as cm: RemoteUnwinder(os.getpid()) self.assertIn( "Reading the PyRuntime section is not supported on this platform", - str(cm.exception) + str(cm.exception), ) -class TestDetectionOfThreadStatus(unittest.TestCase): - @unittest.skipIf( - sys.platform not in ("linux", "darwin", "win32"), - "Test only runs on unsupported platforms (not Linux, macOS, or Windows)", - ) - @unittest.skipIf(sys.platform == "android", "Android raises Linux-specific exception") - def test_thread_status_detection(self): + +class TestDetectionOfThreadStatus(RemoteInspectionTestBase): + def _run_thread_status_test(self, mode, check_condition): + """ + Common pattern for thread status detection tests. + + Args: + mode: Profiling mode (PROFILING_MODE_CPU, PROFILING_MODE_GIL, etc.) 
+ check_condition: Function(statuses, sleeper_tid, busy_tid) -> bool + """ port = find_unused_port() script = textwrap.dedent( f"""\ @@ -1722,203 +1925,146 @@ def busy(): sock.close() """ ) + with os_helper.temp_dir() as work_dir: script_dir = os.path.join(work_dir, "script_pkg") os.mkdir(script_dir) - server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server_socket.bind(("localhost", port)) - server_socket.settimeout(SHORT_TIMEOUT) - server_socket.listen(1) - script_name = _make_test_script(script_dir, "thread_status_script", script) + server_socket = _create_server_socket(port) + script_name = _make_test_script( + script_dir, "thread_status_script", script + ) client_socket = None + try: - p = subprocess.Popen([sys.executable, script_name]) - client_socket, _ = server_socket.accept() - server_socket.close() - response = b"" - sleeper_tid = None - busy_tid = None - while True: - chunk = client_socket.recv(1024) - response += chunk - if b"ready:main" in response and b"ready:sleeper" in response and b"ready:busy" in response: - # Parse TIDs from the response - for line in response.split(b"\n"): - if line.startswith(b"ready:sleeper:"): - try: - sleeper_tid = int(line.split(b":")[-1]) - except Exception: - pass - elif line.startswith(b"ready:busy:"): - try: - busy_tid = int(line.split(b":")[-1]) - except Exception: - pass - break + with _managed_subprocess([sys.executable, script_name]) as p: + client_socket, _ = server_socket.accept() + server_socket.close() + server_socket = None - attempts = 10 - statuses = {} - try: - unwinder = RemoteUnwinder(p.pid, all_threads=True, mode=PROFILING_MODE_CPU, - skip_non_matching_threads=False) - for _ in range(attempts): - traces = unwinder.get_stack_trace() - # Find threads and their statuses - statuses = {} - for interpreter_info in traces: - for thread_info in interpreter_info.threads: - statuses[thread_info.thread_id] = thread_info.status - - # Check if sleeper thread is off CPU and busy thread is on CPU - # In the new flags system: - # - sleeper should NOT have ON_CPU flag (off CPU) - # - busy should have ON_CPU flag - if (sleeper_tid in statuses and - busy_tid in statuses and - not (statuses[sleeper_tid] & THREAD_STATUS_ON_CPU) and - (statuses[busy_tid] & THREAD_STATUS_ON_CPU)): - break - time.sleep(0.5) # Give a bit of time to let threads settle - except PermissionError: - self.skipTest( - "Insufficient permissions to read the stack trace" + # Wait for all ready signals and parse TIDs + response = _wait_for_signal( + client_socket, + [b"ready:main", b"ready:sleeper", b"ready:busy"], ) - self.assertIsNotNone(sleeper_tid, "Sleeper thread id not received") - self.assertIsNotNone(busy_tid, "Busy thread id not received") - self.assertIn(sleeper_tid, statuses, "Sleeper tid not found in sampled threads") - self.assertIn(busy_tid, statuses, "Busy tid not found in sampled threads") - self.assertFalse(statuses[sleeper_tid] & THREAD_STATUS_ON_CPU, "Sleeper thread should be off CPU") - self.assertTrue(statuses[busy_tid] & THREAD_STATUS_ON_CPU, "Busy thread should be on CPU") + sleeper_tid = None + busy_tid = None + for line in response.split(b"\n"): + if line.startswith(b"ready:sleeper:"): + try: + sleeper_tid = int(line.split(b":")[-1]) + except (ValueError, IndexError): + pass + elif line.startswith(b"ready:busy:"): + try: + busy_tid = int(line.split(b":")[-1]) + except (ValueError, IndexError): + pass - finally: - if client_socket is not None: - client_socket.close() - 
p.terminate() - p.wait(timeout=SHORT_TIMEOUT) - - @unittest.skipIf( - sys.platform not in ("linux", "darwin", "win32"), - "Test only runs on unsupported platforms (not Linux, macOS, or Windows)", - ) - @unittest.skipIf(sys.platform == "android", "Android raises Linux-specific exception") - def test_thread_status_gil_detection(self): - port = find_unused_port() - script = textwrap.dedent( - f"""\ - import time, sys, socket, threading - import os - - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.connect(('localhost', {port})) - - def sleeper(): - tid = threading.get_native_id() - sock.sendall(f'ready:sleeper:{{tid}}\\n'.encode()) - time.sleep(10000) - - def busy(): - tid = threading.get_native_id() - sock.sendall(f'ready:busy:{{tid}}\\n'.encode()) - x = 0 - while True: - x = x + 1 - time.sleep(0.5) - - t1 = threading.Thread(target=sleeper) - t2 = threading.Thread(target=busy) - t1.start() - t2.start() - sock.sendall(b'ready:main\\n') - t1.join() - t2.join() - sock.close() - """ - ) - with os_helper.temp_dir() as work_dir: - script_dir = os.path.join(work_dir, "script_pkg") - os.mkdir(script_dir) - server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server_socket.bind(("localhost", port)) - server_socket.settimeout(SHORT_TIMEOUT) - server_socket.listen(1) - - script_name = _make_test_script(script_dir, "thread_status_script", script) - client_socket = None - try: - p = subprocess.Popen([sys.executable, script_name]) - client_socket, _ = server_socket.accept() - server_socket.close() - response = b"" - sleeper_tid = None - busy_tid = None - while True: - chunk = client_socket.recv(1024) - response += chunk - if b"ready:main" in response and b"ready:sleeper" in response and b"ready:busy" in response: - # Parse TIDs from the response - for line in response.split(b"\n"): - if line.startswith(b"ready:sleeper:"): - try: - sleeper_tid = int(line.split(b":")[-1]) - except Exception: - pass - elif line.startswith(b"ready:busy:"): - try: - busy_tid = int(line.split(b":")[-1]) - except Exception: - pass - break - - attempts = 10 - statuses = {} - try: - unwinder = RemoteUnwinder(p.pid, all_threads=True, mode=PROFILING_MODE_GIL, - skip_non_matching_threads=False) - for _ in range(attempts): - traces = unwinder.get_stack_trace() - # Find threads and their statuses - statuses = {} - for interpreter_info in traces: - for thread_info in interpreter_info.threads: - statuses[thread_info.thread_id] = thread_info.status - - # Check if sleeper thread doesn't have GIL and busy thread has GIL - # In the new flags system: - # - sleeper should NOT have HAS_GIL flag (waiting for GIL) - # - busy should have HAS_GIL flag - if (sleeper_tid in statuses and - busy_tid in statuses and - not (statuses[sleeper_tid] & THREAD_STATUS_HAS_GIL) and - (statuses[busy_tid] & THREAD_STATUS_HAS_GIL)): - break - time.sleep(0.5) # Give a bit of time to let threads settle - except PermissionError: - self.skipTest( - "Insufficient permissions to read the stack trace" + self.assertIsNotNone( + sleeper_tid, "Sleeper thread id not received" + ) + self.assertIsNotNone( + busy_tid, "Busy thread id not received" ) - self.assertIsNotNone(sleeper_tid, "Sleeper thread id not received") - self.assertIsNotNone(busy_tid, "Busy thread id not received") - self.assertIn(sleeper_tid, statuses, "Sleeper tid not found in sampled threads") - self.assertIn(busy_tid, statuses, "Busy tid not found in sampled threads") - self.assertFalse(statuses[sleeper_tid] 
& THREAD_STATUS_HAS_GIL, "Sleeper thread should not have GIL") - self.assertTrue(statuses[busy_tid] & THREAD_STATUS_HAS_GIL, "Busy thread should have GIL") + # Sample until we see expected thread states + statuses = {} + try: + unwinder = RemoteUnwinder( + p.pid, + all_threads=True, + mode=mode, + skip_non_matching_threads=False, + ) + for _ in range(MAX_TRIES): + traces = unwinder.get_stack_trace() + statuses = self._get_thread_statuses(traces) + if check_condition( + statuses, sleeper_tid, busy_tid + ): + break + time.sleep(0.5) + except PermissionError: + self.skipTest( + "Insufficient permissions to read the stack trace" + ) + + return statuses, sleeper_tid, busy_tid finally: - if client_socket is not None: - client_socket.close() - p.terminate() - p.wait(timeout=SHORT_TIMEOUT) + _cleanup_sockets(client_socket, server_socket) @unittest.skipIf( sys.platform not in ("linux", "darwin", "win32"), "Test only runs on supported platforms (Linux, macOS, or Windows)", ) - @unittest.skipIf(sys.platform == "android", "Android raises Linux-specific exception") + @unittest.skipIf( + sys.platform == "android", "Android raises Linux-specific exception" + ) + def test_thread_status_detection(self): + def check_cpu_status(statuses, sleeper_tid, busy_tid): + return ( + sleeper_tid in statuses + and busy_tid in statuses + and not (statuses[sleeper_tid] & THREAD_STATUS_ON_CPU) + and (statuses[busy_tid] & THREAD_STATUS_ON_CPU) + ) + + statuses, sleeper_tid, busy_tid = self._run_thread_status_test( + PROFILING_MODE_CPU, check_cpu_status + ) + + self.assertIn(sleeper_tid, statuses) + self.assertIn(busy_tid, statuses) + self.assertFalse( + statuses[sleeper_tid] & THREAD_STATUS_ON_CPU, + "Sleeper thread should be off CPU", + ) + self.assertTrue( + statuses[busy_tid] & THREAD_STATUS_ON_CPU, + "Busy thread should be on CPU", + ) + + @unittest.skipIf( + sys.platform not in ("linux", "darwin", "win32"), + "Test only runs on supported platforms (Linux, macOS, or Windows)", + ) + @unittest.skipIf( + sys.platform == "android", "Android raises Linux-specific exception" + ) + def test_thread_status_gil_detection(self): + def check_gil_status(statuses, sleeper_tid, busy_tid): + return ( + sleeper_tid in statuses + and busy_tid in statuses + and not (statuses[sleeper_tid] & THREAD_STATUS_HAS_GIL) + and (statuses[busy_tid] & THREAD_STATUS_HAS_GIL) + ) + + statuses, sleeper_tid, busy_tid = self._run_thread_status_test( + PROFILING_MODE_GIL, check_gil_status + ) + + self.assertIn(sleeper_tid, statuses) + self.assertIn(busy_tid, statuses) + self.assertFalse( + statuses[sleeper_tid] & THREAD_STATUS_HAS_GIL, + "Sleeper thread should not have GIL", + ) + self.assertTrue( + statuses[busy_tid] & THREAD_STATUS_HAS_GIL, + "Busy thread should have GIL", + ) + + @unittest.skipIf( + sys.platform not in ("linux", "darwin", "win32"), + "Test only runs on supported platforms (Linux, macOS, or Windows)", + ) + @unittest.skipIf( + sys.platform == "android", "Android raises Linux-specific exception" + ) def test_thread_status_all_mode_detection(self): port = find_unused_port() script = textwrap.dedent( @@ -1951,91 +2097,967 @@ def busy_thread(): with os_helper.temp_dir() as tmp_dir: script_file = make_script(tmp_dir, "script", script) - server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server_socket.bind(("localhost", port)) - server_socket.listen(2) - server_socket.settimeout(SHORT_TIMEOUT) - - p = subprocess.Popen( - [sys.executable, script_file], - 
stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - + server_socket = _create_server_socket(port, backlog=2) client_sockets = [] + try: - sleeper_tid = None - busy_tid = None + with _managed_subprocess( + [sys.executable, script_file], + ) as p: + sleeper_tid = None + busy_tid = None - # Receive thread IDs from the child process - for _ in range(2): - client_socket, _ = server_socket.accept() - client_sockets.append(client_socket) - line = client_socket.recv(1024) - if line: - if line.startswith(b"sleeper:"): - try: - sleeper_tid = int(line.split(b":")[-1]) - except Exception: - pass - elif line.startswith(b"busy:"): - try: - busy_tid = int(line.split(b":")[-1]) - except Exception: - pass + # Receive thread IDs from the child process + for _ in range(2): + client_socket, _ = server_socket.accept() + client_sockets.append(client_socket) + line = client_socket.recv(1024) + if line: + if line.startswith(b"sleeper:"): + try: + sleeper_tid = int(line.split(b":")[-1]) + except (ValueError, IndexError): + pass + elif line.startswith(b"busy:"): + try: + busy_tid = int(line.split(b":")[-1]) + except (ValueError, IndexError): + pass - server_socket.close() + server_socket.close() + server_socket = None - attempts = 10 - statuses = {} - try: - unwinder = RemoteUnwinder(p.pid, all_threads=True, mode=PROFILING_MODE_ALL, - skip_non_matching_threads=False) - for _ in range(attempts): - traces = unwinder.get_stack_trace() - # Find threads and their statuses - statuses = {} - for interpreter_info in traces: - for thread_info in interpreter_info.threads: - statuses[thread_info.thread_id] = thread_info.status + statuses = {} + try: + unwinder = RemoteUnwinder( + p.pid, + all_threads=True, + mode=PROFILING_MODE_ALL, + skip_non_matching_threads=False, + ) + for _ in range(MAX_TRIES): + traces = unwinder.get_stack_trace() + statuses = self._get_thread_statuses(traces) - # Check ALL mode provides both GIL and CPU info - # - sleeper should NOT have ON_CPU and NOT have HAS_GIL - # - busy should have ON_CPU and have HAS_GIL - if (sleeper_tid in statuses and - busy_tid in statuses and - not (statuses[sleeper_tid] & THREAD_STATUS_ON_CPU) and - not (statuses[sleeper_tid] & THREAD_STATUS_HAS_GIL) and - (statuses[busy_tid] & THREAD_STATUS_ON_CPU) and - (statuses[busy_tid] & THREAD_STATUS_HAS_GIL)): - break - time.sleep(0.5) - except PermissionError: - self.skipTest( - "Insufficient permissions to read the stack trace" + # Check ALL mode provides both GIL and CPU info + if ( + sleeper_tid in statuses + and busy_tid in statuses + and not ( + statuses[sleeper_tid] + & THREAD_STATUS_ON_CPU + ) + and not ( + statuses[sleeper_tid] + & THREAD_STATUS_HAS_GIL + ) + and (statuses[busy_tid] & THREAD_STATUS_ON_CPU) + and ( + statuses[busy_tid] & THREAD_STATUS_HAS_GIL + ) + ): + break + time.sleep(0.5) + except PermissionError: + self.skipTest( + "Insufficient permissions to read the stack trace" + ) + + self.assertIsNotNone( + sleeper_tid, "Sleeper thread id not received" + ) + self.assertIsNotNone( + busy_tid, "Busy thread id not received" + ) + self.assertIn(sleeper_tid, statuses) + self.assertIn(busy_tid, statuses) + + # Sleeper: off CPU, no GIL + self.assertFalse( + statuses[sleeper_tid] & THREAD_STATUS_ON_CPU, + "Sleeper should be off CPU", + ) + self.assertFalse( + statuses[sleeper_tid] & THREAD_STATUS_HAS_GIL, + "Sleeper should not have GIL", ) - self.assertIsNotNone(sleeper_tid, "Sleeper thread id not received") - self.assertIsNotNone(busy_tid, "Busy thread id not received") - self.assertIn(sleeper_tid, statuses, 
"Sleeper tid not found in sampled threads") - self.assertIn(busy_tid, statuses, "Busy tid not found in sampled threads") - - # Sleeper thread: off CPU, no GIL - self.assertFalse(statuses[sleeper_tid] & THREAD_STATUS_ON_CPU, "Sleeper should be off CPU") - self.assertFalse(statuses[sleeper_tid] & THREAD_STATUS_HAS_GIL, "Sleeper should not have GIL") - - # Busy thread: on CPU, has GIL - self.assertTrue(statuses[busy_tid] & THREAD_STATUS_ON_CPU, "Busy should be on CPU") - self.assertTrue(statuses[busy_tid] & THREAD_STATUS_HAS_GIL, "Busy should have GIL") - + # Busy: on CPU, has GIL + self.assertTrue( + statuses[busy_tid] & THREAD_STATUS_ON_CPU, + "Busy should be on CPU", + ) + self.assertTrue( + statuses[busy_tid] & THREAD_STATUS_HAS_GIL, + "Busy should have GIL", + ) finally: - for client_socket in client_sockets: - client_socket.close() - p.terminate() - p.wait(timeout=SHORT_TIMEOUT) - p.stdout.close() - p.stderr.close() + _cleanup_sockets(*client_sockets, server_socket) + + +class TestFrameCaching(RemoteInspectionTestBase): + """Test that frame caching produces correct results. + + Uses socket-based synchronization for deterministic testing. + All tests verify cache reuse via object identity checks (assertIs). + """ + + @contextmanager + def _target_process(self, script_body): + """Context manager for running a target process with socket sync.""" + port = find_unused_port() + script = f"""\ +import socket +sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +sock.connect(('localhost', {port})) +{textwrap.dedent(script_body)} +""" + + with os_helper.temp_dir() as work_dir: + script_dir = os.path.join(work_dir, "script_pkg") + os.mkdir(script_dir) + + server_socket = _create_server_socket(port) + script_name = _make_test_script(script_dir, "script", script) + client_socket = None + + try: + with _managed_subprocess([sys.executable, script_name]) as p: + client_socket, _ = server_socket.accept() + server_socket.close() + server_socket = None + + def make_unwinder(cache_frames=True): + return RemoteUnwinder( + p.pid, all_threads=True, cache_frames=cache_frames + ) + + yield p, client_socket, make_unwinder + + except PermissionError: + self.skipTest( + "Insufficient permissions to read the stack trace" + ) + finally: + _cleanup_sockets(client_socket, server_socket) + + def _get_frames_with_retry(self, unwinder, required_funcs): + """Get frames containing required_funcs, with retry for transient errors.""" + for _ in range(MAX_TRIES): + with contextlib.suppress(OSError, RuntimeError): + traces = unwinder.get_stack_trace() + for interp in traces: + for thread in interp.threads: + funcs = {f.funcname for f in thread.frame_info} + if required_funcs.issubset(funcs): + return thread.frame_info + time.sleep(0.1) + return None + + def _sample_frames( + self, + client_socket, + unwinder, + wait_signal, + send_ack, + required_funcs, + expected_frames=1, + ): + """Wait for signal, sample frames with retry until required funcs present, send ack.""" + _wait_for_signal(client_socket, wait_signal) + frames = None + for _ in range(MAX_TRIES): + frames = self._get_frames_with_retry(unwinder, required_funcs) + if frames and len(frames) >= expected_frames: + break + time.sleep(0.1) + client_socket.sendall(send_ack) + return frames + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_cache_hit_same_stack(self): + """Test that consecutive samples reuse cached parent frame objects. 
+ + The current frame (index 0) is always re-read from memory to get + updated line numbers, so it may be a different object. Parent frames + (index 1+) should be identical objects from cache. + """ + script_body = """\ + def level3(): + sock.sendall(b"sync1") + sock.recv(16) + sock.sendall(b"sync2") + sock.recv(16) + sock.sendall(b"sync3") + sock.recv(16) + + def level2(): + level3() + + def level1(): + level2() + + level1() + """ + + with self._target_process(script_body) as ( + p, + client_socket, + make_unwinder, + ): + unwinder = make_unwinder(cache_frames=True) + expected = {"level1", "level2", "level3"} + + frames1 = self._sample_frames( + client_socket, unwinder, b"sync1", b"ack", expected + ) + frames2 = self._sample_frames( + client_socket, unwinder, b"sync2", b"ack", expected + ) + frames3 = self._sample_frames( + client_socket, unwinder, b"sync3", b"done", expected + ) + + self.assertIsNotNone(frames1) + self.assertIsNotNone(frames2) + self.assertIsNotNone(frames3) + self.assertEqual(len(frames1), len(frames2)) + self.assertEqual(len(frames2), len(frames3)) + + # Current frame (index 0) is always re-read, so check value equality + self.assertEqual(frames1[0].funcname, frames2[0].funcname) + self.assertEqual(frames2[0].funcname, frames3[0].funcname) + + # Parent frames (index 1+) must be identical objects (cache reuse) + for i in range(1, len(frames1)): + f1, f2, f3 = frames1[i], frames2[i], frames3[i] + self.assertIs( + f1, f2, f"Frame {i}: samples 1-2 must be same object" + ) + self.assertIs( + f2, f3, f"Frame {i}: samples 2-3 must be same object" + ) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_line_number_updates_in_same_frame(self): + """Test that line numbers are correctly updated when execution moves within a function. + + When the profiler samples at different points within the same function, + it must report the correct line number for each sample, not stale cached values. 
+ """ + script_body = """\ + def outer(): + inner() + + def inner(): + sock.sendall(b"line_a"); sock.recv(16) + sock.sendall(b"line_b"); sock.recv(16) + sock.sendall(b"line_c"); sock.recv(16) + sock.sendall(b"line_d"); sock.recv(16) + + outer() + """ + + with self._target_process(script_body) as ( + p, + client_socket, + make_unwinder, + ): + unwinder = make_unwinder(cache_frames=True) + + frames_a = self._sample_frames( + client_socket, unwinder, b"line_a", b"ack", {"inner"} + ) + frames_b = self._sample_frames( + client_socket, unwinder, b"line_b", b"ack", {"inner"} + ) + frames_c = self._sample_frames( + client_socket, unwinder, b"line_c", b"ack", {"inner"} + ) + frames_d = self._sample_frames( + client_socket, unwinder, b"line_d", b"done", {"inner"} + ) + + self.assertIsNotNone(frames_a) + self.assertIsNotNone(frames_b) + self.assertIsNotNone(frames_c) + self.assertIsNotNone(frames_d) + + # Get the 'inner' frame from each sample (should be index 0) + inner_a = frames_a[0] + inner_b = frames_b[0] + inner_c = frames_c[0] + inner_d = frames_d[0] + + self.assertEqual(inner_a.funcname, "inner") + self.assertEqual(inner_b.funcname, "inner") + self.assertEqual(inner_c.funcname, "inner") + self.assertEqual(inner_d.funcname, "inner") + + # Line numbers must be different and increasing (execution moves forward) + self.assertLess( + inner_a.lineno, inner_b.lineno, "Line B should be after line A" + ) + self.assertLess( + inner_b.lineno, inner_c.lineno, "Line C should be after line B" + ) + self.assertLess( + inner_c.lineno, inner_d.lineno, "Line D should be after line C" + ) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_cache_invalidation_on_return(self): + """Test cache invalidation when stack shrinks (function returns).""" + script_body = """\ + def inner(): + sock.sendall(b"at_inner") + sock.recv(16) + + def outer(): + inner() + sock.sendall(b"at_outer") + sock.recv(16) + + outer() + """ + + with self._target_process(script_body) as ( + p, + client_socket, + make_unwinder, + ): + unwinder = make_unwinder(cache_frames=True) + + frames_deep = self._sample_frames( + client_socket, + unwinder, + b"at_inner", + b"ack", + {"inner", "outer"}, + ) + frames_shallow = self._sample_frames( + client_socket, unwinder, b"at_outer", b"done", {"outer"} + ) + + self.assertIsNotNone(frames_deep) + self.assertIsNotNone(frames_shallow) + + funcs_deep = [f.funcname for f in frames_deep] + funcs_shallow = [f.funcname for f in frames_shallow] + + self.assertIn("inner", funcs_deep) + self.assertIn("outer", funcs_deep) + self.assertNotIn("inner", funcs_shallow) + self.assertIn("outer", funcs_shallow) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_cache_invalidation_on_call(self): + """Test cache invalidation when stack grows (new function called).""" + script_body = """\ + def deeper(): + sock.sendall(b"at_deeper") + sock.recv(16) + + def middle(): + sock.sendall(b"at_middle") + sock.recv(16) + deeper() + + def top(): + middle() + + top() + """ + + with self._target_process(script_body) as ( + p, + client_socket, + make_unwinder, + ): + unwinder = make_unwinder(cache_frames=True) + + frames_before = self._sample_frames( + client_socket, + unwinder, + b"at_middle", + b"ack", + {"middle", "top"}, + ) + frames_after = self._sample_frames( + 
client_socket, + unwinder, + b"at_deeper", + b"done", + {"deeper", "middle", "top"}, + ) + + self.assertIsNotNone(frames_before) + self.assertIsNotNone(frames_after) + + funcs_before = [f.funcname for f in frames_before] + funcs_after = [f.funcname for f in frames_after] + + self.assertIn("middle", funcs_before) + self.assertIn("top", funcs_before) + self.assertNotIn("deeper", funcs_before) + + self.assertIn("deeper", funcs_after) + self.assertIn("middle", funcs_after) + self.assertIn("top", funcs_after) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_partial_stack_reuse(self): + """Test that unchanged bottom frames are reused when top changes (A→B→C to A→B→D).""" + script_body = """\ + def func_c(): + sock.sendall(b"at_c") + sock.recv(16) + + def func_d(): + sock.sendall(b"at_d") + sock.recv(16) + + def func_b(): + func_c() + func_d() + + def func_a(): + func_b() + + func_a() + """ + + with self._target_process(script_body) as ( + p, + client_socket, + make_unwinder, + ): + unwinder = make_unwinder(cache_frames=True) + + # Sample at C: stack is A→B→C + frames_c = self._sample_frames( + client_socket, + unwinder, + b"at_c", + b"ack", + {"func_a", "func_b", "func_c"}, + ) + # Sample at D: stack is A→B→D (C returned, D called) + frames_d = self._sample_frames( + client_socket, + unwinder, + b"at_d", + b"done", + {"func_a", "func_b", "func_d"}, + ) + + self.assertIsNotNone(frames_c) + self.assertIsNotNone(frames_d) + + # Find func_a and func_b frames in both samples + def find_frame(frames, funcname): + for f in frames: + if f.funcname == funcname: + return f + return None + + frame_a_in_c = find_frame(frames_c, "func_a") + frame_b_in_c = find_frame(frames_c, "func_b") + frame_a_in_d = find_frame(frames_d, "func_a") + frame_b_in_d = find_frame(frames_d, "func_b") + + self.assertIsNotNone(frame_a_in_c) + self.assertIsNotNone(frame_b_in_c) + self.assertIsNotNone(frame_a_in_d) + self.assertIsNotNone(frame_b_in_d) + + # The bottom frames (A, B) should be the SAME objects (cache reuse) + self.assertIs( + frame_a_in_c, + frame_a_in_d, + "func_a frame should be reused from cache", + ) + self.assertIs( + frame_b_in_c, + frame_b_in_d, + "func_b frame should be reused from cache", + ) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_recursive_frames(self): + """Test caching with same function appearing multiple times (recursion).""" + script_body = """\ + def recurse(n): + if n <= 0: + sock.sendall(b"sync1") + sock.recv(16) + sock.sendall(b"sync2") + sock.recv(16) + else: + recurse(n - 1) + + recurse(5) + """ + + with self._target_process(script_body) as ( + p, + client_socket, + make_unwinder, + ): + unwinder = make_unwinder(cache_frames=True) + + frames1 = self._sample_frames( + client_socket, unwinder, b"sync1", b"ack", {"recurse"} + ) + frames2 = self._sample_frames( + client_socket, unwinder, b"sync2", b"done", {"recurse"} + ) + + self.assertIsNotNone(frames1) + self.assertIsNotNone(frames2) + + # Should have multiple "recurse" frames (6 total: recurse(5) down to recurse(0)) + recurse_count = sum(1 for f in frames1 if f.funcname == "recurse") + self.assertEqual(recurse_count, 6, "Should have 6 recursive frames") + + self.assertEqual(len(frames1), len(frames2)) + + # Current frame (index 0) is re-read, check value equality + 
self.assertEqual(frames1[0].funcname, frames2[0].funcname) + + # Parent frames (index 1+) should be identical objects (cache reuse) + for i in range(1, len(frames1)): + self.assertIs( + frames1[i], + frames2[i], + f"Frame {i}: recursive frames must be same object", + ) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_cache_vs_no_cache_equivalence(self): + """Test that cache_frames=True and cache_frames=False produce equivalent results.""" + script_body = """\ + def level3(): + sock.sendall(b"ready"); sock.recv(16) + + def level2(): + level3() + + def level1(): + level2() + + level1() + """ + + with self._target_process(script_body) as ( + p, + client_socket, + make_unwinder, + ): + _wait_for_signal(client_socket, b"ready") + + # Sample with cache + unwinder_cache = make_unwinder(cache_frames=True) + frames_cached = self._get_frames_with_retry( + unwinder_cache, {"level1", "level2", "level3"} + ) + + # Sample without cache + unwinder_no_cache = make_unwinder(cache_frames=False) + frames_no_cache = self._get_frames_with_retry( + unwinder_no_cache, {"level1", "level2", "level3"} + ) + + client_socket.sendall(b"done") + + self.assertIsNotNone(frames_cached) + self.assertIsNotNone(frames_no_cache) + + # Same number of frames + self.assertEqual(len(frames_cached), len(frames_no_cache)) + + # Same function names in same order + funcs_cached = [f.funcname for f in frames_cached] + funcs_no_cache = [f.funcname for f in frames_no_cache] + self.assertEqual(funcs_cached, funcs_no_cache) + + # Same line numbers + lines_cached = [f.lineno for f in frames_cached] + lines_no_cache = [f.lineno for f in frames_no_cache] + self.assertEqual(lines_cached, lines_no_cache) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_cache_per_thread_isolation(self): + """Test that frame cache is per-thread and cache invalidation works independently.""" + script_body = """\ + import threading + + lock = threading.Lock() + + def sync(msg): + with lock: + sock.sendall(msg + b"\\n") + sock.recv(1) + + # Thread 1 functions + def baz1(): + sync(b"t1:baz1") + + def bar1(): + baz1() + + def blech1(): + sync(b"t1:blech1") + + def foo1(): + bar1() # Goes down to baz1, syncs + blech1() # Returns up, goes down to blech1, syncs + + # Thread 2 functions + def baz2(): + sync(b"t2:baz2") + + def bar2(): + baz2() + + def blech2(): + sync(b"t2:blech2") + + def foo2(): + bar2() # Goes down to baz2, syncs + blech2() # Returns up, goes down to blech2, syncs + + t1 = threading.Thread(target=foo1) + t2 = threading.Thread(target=foo2) + t1.start() + t2.start() + t1.join() + t2.join() + """ + + with self._target_process(script_body) as ( + p, + client_socket, + make_unwinder, + ): + unwinder = make_unwinder(cache_frames=True) + + # Message dispatch table: signal -> required functions for that thread + dispatch = { + b"t1:baz1": {"baz1", "bar1", "foo1"}, + b"t2:baz2": {"baz2", "bar2", "foo2"}, + b"t1:blech1": {"blech1", "foo1"}, + b"t2:blech2": {"blech2", "foo2"}, + } + + # Track results for each sync point + results = {} + + # Process 4 sync points (order depends on thread scheduling) + buffer = _wait_for_signal(client_socket, b"\n") + for i in range(4): + # Extract first message from buffer + msg, sep, buffer = buffer.partition(b"\n") + self.assertIn(msg, dispatch, f"Unexpected 
message: {msg!r}") + + # Sample frames for the thread at this sync point + required_funcs = dispatch[msg] + frames = self._get_frames_with_retry(unwinder, required_funcs) + self.assertIsNotNone(frames, f"Thread not found for {msg!r}") + results[msg] = [f.funcname for f in frames] + + # Release thread and wait for next message (if not last) + client_socket.sendall(b"k") + if i < 3: + buffer += _wait_for_signal(client_socket, b"\n") + + # Validate Phase 1: baz snapshots + t1_baz = results.get(b"t1:baz1") + t2_baz = results.get(b"t2:baz2") + self.assertIsNotNone(t1_baz, "Missing t1:baz1 snapshot") + self.assertIsNotNone(t2_baz, "Missing t2:baz2 snapshot") + + # Thread 1 at baz1: should have foo1->bar1->baz1 + self.assertIn("baz1", t1_baz) + self.assertIn("bar1", t1_baz) + self.assertIn("foo1", t1_baz) + self.assertNotIn("blech1", t1_baz) + # No cross-contamination + self.assertNotIn("baz2", t1_baz) + self.assertNotIn("bar2", t1_baz) + self.assertNotIn("foo2", t1_baz) + + # Thread 2 at baz2: should have foo2->bar2->baz2 + self.assertIn("baz2", t2_baz) + self.assertIn("bar2", t2_baz) + self.assertIn("foo2", t2_baz) + self.assertNotIn("blech2", t2_baz) + # No cross-contamination + self.assertNotIn("baz1", t2_baz) + self.assertNotIn("bar1", t2_baz) + self.assertNotIn("foo1", t2_baz) + + # Validate Phase 2: blech snapshots (cache invalidation test) + t1_blech = results.get(b"t1:blech1") + t2_blech = results.get(b"t2:blech2") + self.assertIsNotNone(t1_blech, "Missing t1:blech1 snapshot") + self.assertIsNotNone(t2_blech, "Missing t2:blech2 snapshot") + + # Thread 1 at blech1: bar1/baz1 should be GONE (cache invalidated) + self.assertIn("blech1", t1_blech) + self.assertIn("foo1", t1_blech) + self.assertNotIn( + "bar1", t1_blech, "Cache not invalidated: bar1 still present" + ) + self.assertNotIn( + "baz1", t1_blech, "Cache not invalidated: baz1 still present" + ) + # No cross-contamination + self.assertNotIn("blech2", t1_blech) + + # Thread 2 at blech2: bar2/baz2 should be GONE (cache invalidated) + self.assertIn("blech2", t2_blech) + self.assertIn("foo2", t2_blech) + self.assertNotIn( + "bar2", t2_blech, "Cache not invalidated: bar2 still present" + ) + self.assertNotIn( + "baz2", t2_blech, "Cache not invalidated: baz2 still present" + ) + # No cross-contamination + self.assertNotIn("blech1", t2_blech) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_new_unwinder_with_stale_last_profiled_frame(self): + """Test that a new unwinder returns complete stack when cache lookup misses.""" + script_body = """\ + def level4(): + sock.sendall(b"sync1") + sock.recv(16) + sock.sendall(b"sync2") + sock.recv(16) + + def level3(): + level4() + + def level2(): + level3() + + def level1(): + level2() + + level1() + """ + + with self._target_process(script_body) as ( + p, + client_socket, + make_unwinder, + ): + expected = {"level1", "level2", "level3", "level4"} + + # First unwinder samples - this sets last_profiled_frame in target + unwinder1 = make_unwinder(cache_frames=True) + frames1 = self._sample_frames( + client_socket, unwinder1, b"sync1", b"ack", expected + ) + + # Create NEW unwinder (empty cache) and sample + # The target still has last_profiled_frame set from unwinder1 + unwinder2 = make_unwinder(cache_frames=True) + frames2 = self._sample_frames( + client_socket, unwinder2, b"sync2", b"done", expected + ) + + self.assertIsNotNone(frames1) + self.assertIsNotNone(frames2) + + 
funcs1 = [f.funcname for f in frames1] + funcs2 = [f.funcname for f in frames2] + + # Both should have all levels + for level in ["level1", "level2", "level3", "level4"]: + self.assertIn(level, funcs1, f"{level} missing from first sample") + self.assertIn(level, funcs2, f"{level} missing from second sample") + + # Should have same stack depth + self.assertEqual( + len(frames1), + len(frames2), + "New unwinder should return complete stack despite stale last_profiled_frame", + ) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_cache_exhaustion(self): + """Test cache works when frame limit (1024) is exceeded. + + FRAME_CACHE_MAX_FRAMES=1024. With 1100 recursive frames, + the cache can't store all of them but should still work. + """ + # Use 1100 to exceed FRAME_CACHE_MAX_FRAMES=1024 + depth = 1100 + script_body = f"""\ +import sys +sys.setrecursionlimit(2000) + +def recurse(n): + if n <= 0: + sock.sendall(b"ready") + sock.recv(16) # wait for ack + sock.sendall(b"ready2") + sock.recv(16) # wait for done + return + recurse(n - 1) + +recurse({depth}) +""" + + with self._target_process(script_body) as ( + p, + client_socket, + make_unwinder, + ): + unwinder_cache = make_unwinder(cache_frames=True) + unwinder_no_cache = make_unwinder(cache_frames=False) + + frames_cached = self._sample_frames( + client_socket, + unwinder_cache, + b"ready", + b"ack", + {"recurse"}, + expected_frames=1102, + ) + # Sample again with no cache for comparison + frames_no_cache = self._sample_frames( + client_socket, + unwinder_no_cache, + b"ready2", + b"done", + {"recurse"}, + expected_frames=1102, + ) + + self.assertIsNotNone(frames_cached) + self.assertIsNotNone(frames_no_cache) + + # Both should have many recurse frames (> 1024 limit) + cached_count = [f.funcname for f in frames_cached].count("recurse") + no_cache_count = [f.funcname for f in frames_no_cache].count("recurse") + + self.assertGreater( + cached_count, 1000, "Should have >1000 recurse frames" + ) + self.assertGreater( + no_cache_count, 1000, "Should have >1000 recurse frames" + ) + + # Both modes should produce same frame count + self.assertEqual( + len(frames_cached), + len(frames_no_cache), + "Cache exhaustion should not affect stack completeness", + ) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_get_stats(self): + """Test that get_stats() returns statistics when stats=True.""" + script_body = """\ + sock.sendall(b"ready") + sock.recv(16) + """ + + with self._target_process(script_body) as (p, client_socket, _): + unwinder = RemoteUnwinder(p.pid, all_threads=True, stats=True) + _wait_for_signal(client_socket, b"ready") + + # Take a sample + unwinder.get_stack_trace() + + stats = unwinder.get_stats() + client_socket.sendall(b"done") + + # Verify expected keys exist + expected_keys = [ + "total_samples", + "frame_cache_hits", + "frame_cache_misses", + "frame_cache_partial_hits", + "frames_read_from_cache", + "frames_read_from_memory", + "frame_cache_hit_rate", + ] + for key in expected_keys: + self.assertIn(key, stats) + + self.assertEqual(stats["total_samples"], 1) + + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_get_stats_disabled_raises(self): + """Test 
that get_stats() raises RuntimeError when stats=False.""" + script_body = """\ + sock.sendall(b"ready") + sock.recv(16) + """ + + with self._target_process(script_body) as (p, client_socket, _): + unwinder = RemoteUnwinder( + p.pid, all_threads=True + ) # stats=False by default + _wait_for_signal(client_socket, b"ready") + + with self.assertRaises(RuntimeError): + unwinder.get_stats() + + client_socket.sendall(b"done") if __name__ == "__main__": diff --git a/Lib/test/test_profiling/test_heatmap.py b/Lib/test/test_profiling/test_heatmap.py index a6ff3b83ea1..24bf3d21c2f 100644 --- a/Lib/test/test_profiling/test_heatmap.py +++ b/Lib/test/test_profiling/test_heatmap.py @@ -147,12 +147,6 @@ def test_init_sets_total_samples_to_zero(self): collector = HeatmapCollector(sample_interval_usec=100) self.assertEqual(collector._total_samples, 0) - def test_init_creates_color_cache(self): - """Test that color cache is initialized.""" - collector = HeatmapCollector(sample_interval_usec=100) - self.assertIsInstance(collector._color_cache, dict) - self.assertEqual(len(collector._color_cache), 0) - def test_init_gets_path_info(self): """Test that path info is retrieved during init.""" collector = HeatmapCollector(sample_interval_usec=100) diff --git a/Lib/test/test_profiling/test_sampling_profiler/helpers.py b/Lib/test/test_profiling/test_sampling_profiler/helpers.py index f1c01afd0fa..0e32d8dd9ea 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/helpers.py +++ b/Lib/test/test_profiling/test_sampling_profiler/helpers.py @@ -38,12 +38,88 @@ SubprocessInfo = namedtuple("SubprocessInfo", ["process", "socket"]) +def _wait_for_signal(sock, expected_signals, timeout=SHORT_TIMEOUT): + """ + Wait for expected signal(s) from a socket with proper timeout and EOF handling. + + Args: + sock: Connected socket to read from + expected_signals: Single bytes object or list of bytes objects to wait for + timeout: Socket timeout in seconds + + Returns: + bytes: Complete accumulated response buffer + + Raises: + RuntimeError: If connection closed before signal received or timeout + """ + if isinstance(expected_signals, bytes): + expected_signals = [expected_signals] + + sock.settimeout(timeout) + buffer = b"" + + while True: + # Check if all expected signals are in buffer + if all(sig in buffer for sig in expected_signals): + return buffer + + try: + chunk = sock.recv(4096) + if not chunk: + raise RuntimeError( + f"Connection closed before receiving expected signals. " + f"Expected: {expected_signals}, Got: {buffer[-200:]!r}" + ) + buffer += chunk + except socket.timeout: + raise RuntimeError( + f"Timeout waiting for signals. " + f"Expected: {expected_signals}, Got: {buffer[-200:]!r}" + ) from None + except OSError as e: + raise RuntimeError( + f"Socket error while waiting for signals: {e}. 
" + f"Expected: {expected_signals}, Got: {buffer[-200:]!r}" + ) from None + + +def _cleanup_sockets(*sockets): + """Safely close multiple sockets, ignoring errors.""" + for sock in sockets: + if sock is not None: + try: + sock.close() + except OSError: + pass + + +def _cleanup_process(proc, timeout=SHORT_TIMEOUT): + """Terminate a process gracefully, escalating to kill if needed.""" + if proc.poll() is not None: + return + proc.terminate() + try: + proc.wait(timeout=timeout) + return + except subprocess.TimeoutExpired: + pass + proc.kill() + try: + proc.wait(timeout=timeout) + except subprocess.TimeoutExpired: + pass # Process refuses to die, nothing more we can do + + @contextlib.contextmanager -def test_subprocess(script): +def test_subprocess(script, wait_for_working=False): """Context manager to create a test subprocess with socket synchronization. Args: - script: Python code to execute in the subprocess + script: Python code to execute in the subprocess. If wait_for_working + is True, script should send b"working" after starting work. + wait_for_working: If True, wait for both "ready" and "working" signals. + Default False for backward compatibility. Yields: SubprocessInfo: Named tuple with process and socket objects @@ -80,19 +156,18 @@ def test_subprocess(script): # Wait for process to connect and send ready signal client_socket, _ = server_socket.accept() server_socket.close() - response = client_socket.recv(1024) - if response != b"ready": - raise RuntimeError( - f"Unexpected response from subprocess: {response!r}" - ) + server_socket = None + + # Wait for ready signal, and optionally working signal + if wait_for_working: + _wait_for_signal(client_socket, [b"ready", b"working"]) + else: + _wait_for_signal(client_socket, b"ready") yield SubprocessInfo(proc, client_socket) finally: - if client_socket is not None: - client_socket.close() - if proc.poll() is None: - proc.kill() - proc.wait() + _cleanup_sockets(client_socket, server_socket) + _cleanup_process(proc) def close_and_unlink(file): diff --git a/Lib/test/test_profiling/test_sampling_profiler/mocks.py b/Lib/test/test_profiling/test_sampling_profiler/mocks.py index 9f1cd5b83e0..7083362c771 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/mocks.py +++ b/Lib/test/test_profiling/test_sampling_profiler/mocks.py @@ -36,3 +36,38 @@ def __init__(self, interpreter_id, threads): def __repr__(self): return f"MockInterpreterInfo(interpreter_id={self.interpreter_id}, threads={self.threads})" + + +class MockCoroInfo: + """Mock CoroInfo for testing async tasks.""" + + def __init__(self, task_name, call_stack): + self.task_name = task_name # In reality, this is the parent task ID + self.call_stack = call_stack + + def __repr__(self): + return f"MockCoroInfo(task_name={self.task_name}, call_stack={self.call_stack})" + + +class MockTaskInfo: + """Mock TaskInfo for testing async tasks.""" + + def __init__(self, task_id, task_name, coroutine_stack, awaited_by=None): + self.task_id = task_id + self.task_name = task_name + self.coroutine_stack = coroutine_stack # List of CoroInfo objects + self.awaited_by = awaited_by or [] # List of CoroInfo objects (parents) + + def __repr__(self): + return f"MockTaskInfo(task_id={self.task_id}, task_name={self.task_name})" + + +class MockAwaitedInfo: + """Mock AwaitedInfo for testing async tasks.""" + + def __init__(self, thread_id, awaited_by): + self.thread_id = thread_id + self.awaited_by = awaited_by # List of TaskInfo objects + + def __repr__(self): + return 
f"MockAwaitedInfo(thread_id={self.thread_id}, awaited_by={len(self.awaited_by)} tasks)" diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_advanced.py b/Lib/test/test_profiling/test_sampling_profiler/test_advanced.py index 94946d74aa4..843fb3b7416 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_advanced.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_advanced.py @@ -39,32 +39,26 @@ def setUpClass(cls): import gc class ExpensiveGarbage: - """Class that triggers GC with expensive finalizer (callback).""" def __init__(self): self.cycle = self def __del__(self): - # CPU-intensive work in the finalizer callback result = 0 for i in range(100000): result += i * i if i % 1000 == 0: result = result % 1000000 -def main_loop(): - """Main loop that triggers GC with expensive callback.""" - while True: - ExpensiveGarbage() - gc.collect() - -if __name__ == "__main__": - main_loop() +_test_sock.sendall(b"working") +while True: + ExpensiveGarbage() + gc.collect() ''' def test_gc_frames_enabled(self): """Test that GC frames appear when gc tracking is enabled.""" with ( - test_subprocess(self.gc_test_script) as subproc, + test_subprocess(self.gc_test_script, wait_for_working=True) as subproc, io.StringIO() as captured_output, mock.patch("sys.stdout", captured_output), ): @@ -94,7 +88,7 @@ def test_gc_frames_enabled(self): def test_gc_frames_disabled(self): """Test that GC frames do not appear when gc tracking is disabled.""" with ( - test_subprocess(self.gc_test_script) as subproc, + test_subprocess(self.gc_test_script, wait_for_working=True) as subproc, io.StringIO() as captured_output, mock.patch("sys.stdout", captured_output), ): @@ -133,18 +127,13 @@ def setUpClass(cls): cls.native_test_script = """ import operator -def main_loop(): - while True: - # Native code in the middle of the stack: - operator.call(inner) - def inner(): - # Python code at the top of the stack: for _ in range(1_000_0000): pass -if __name__ == "__main__": - main_loop() +_test_sock.sendall(b"working") +while True: + operator.call(inner) """ def test_native_frames_enabled(self): @@ -154,10 +143,7 @@ def test_native_frames_enabled(self): ) self.addCleanup(close_and_unlink, collapsed_file) - with ( - test_subprocess(self.native_test_script) as subproc, - ): - # Suppress profiler output when testing file export + with test_subprocess(self.native_test_script, wait_for_working=True) as subproc: with ( io.StringIO() as captured_output, mock.patch("sys.stdout", captured_output), @@ -199,7 +185,7 @@ def test_native_frames_enabled(self): def test_native_frames_disabled(self): """Test that native frames do not appear when native tracking is disabled.""" with ( - test_subprocess(self.native_test_script) as subproc, + test_subprocess(self.native_test_script, wait_for_working=True) as subproc, io.StringIO() as captured_output, mock.patch("sys.stdout", captured_output), ): diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_async.py b/Lib/test/test_profiling/test_sampling_profiler/test_async.py new file mode 100644 index 00000000000..d8ca86c996b --- /dev/null +++ b/Lib/test/test_profiling/test_sampling_profiler/test_async.py @@ -0,0 +1,799 @@ +"""Tests for async stack reconstruction in the sampling profiler. + +Each test covers a distinct algorithm path or edge case: +1. Graph building: _build_task_graph() +2. Leaf identification: _find_leaf_tasks() +3. 
Stack traversal: _build_linear_stacks() with BFS +""" + +import unittest + +try: + import _remote_debugging # noqa: F401 + from profiling.sampling.pstats_collector import PstatsCollector +except ImportError: + raise unittest.SkipTest( + "Test only runs when _remote_debugging is available" + ) + +from .mocks import MockFrameInfo, MockCoroInfo, MockTaskInfo, MockAwaitedInfo + + +class TestAsyncStackReconstruction(unittest.TestCase): + """Test async task tree linear stack reconstruction algorithm.""" + + def test_empty_input(self): + """Test _build_task_graph with empty awaited_info_list.""" + collector = PstatsCollector(sample_interval_usec=1000) + stacks = list(collector._iter_async_frames([])) + self.assertEqual(len(stacks), 0) + + def test_single_root_task(self): + """Test _find_leaf_tasks: root task with no parents is its own leaf.""" + collector = PstatsCollector(sample_interval_usec=1000) + + root = MockTaskInfo( + task_id=123, + task_name="Task-1", + coroutine_stack=[ + MockCoroInfo( + task_name="Task-1", + call_stack=[MockFrameInfo("main.py", 10, "main")] + ) + ], + awaited_by=[] + ) + + awaited_info_list = [MockAwaitedInfo(thread_id=100, awaited_by=[root])] + stacks = list(collector._iter_async_frames(awaited_info_list)) + + # Single root is both leaf and root + self.assertEqual(len(stacks), 1) + frames, thread_id, leaf_id = stacks[0] + self.assertEqual(leaf_id, 123) + self.assertEqual(thread_id, 100) + + def test_parent_child_chain(self): + """Test _build_linear_stacks: BFS follows parent links from leaf to root. + + Task graph: + + Parent (id=1) + | + Child (id=2) + """ + collector = PstatsCollector(sample_interval_usec=1000) + + child = MockTaskInfo( + task_id=2, + task_name="Child", + coroutine_stack=[ + MockCoroInfo(task_name="Child", call_stack=[MockFrameInfo("c.py", 5, "child_fn")]) + ], + awaited_by=[ + MockCoroInfo(task_name=1, call_stack=[MockFrameInfo("p.py", 10, "parent_await")]) + ] + ) + + parent = MockTaskInfo( + task_id=1, + task_name="Parent", + coroutine_stack=[ + MockCoroInfo(task_name="Parent", call_stack=[MockFrameInfo("p.py", 15, "parent_fn")]) + ], + awaited_by=[] + ) + + awaited_info_list = [MockAwaitedInfo(thread_id=200, awaited_by=[child, parent])] + stacks = list(collector._iter_async_frames(awaited_info_list)) + + # Leaf is child, traverses to parent + self.assertEqual(len(stacks), 1) + frames, thread_id, leaf_id = stacks[0] + self.assertEqual(leaf_id, 2) + + # Verify both child and parent frames present + func_names = [f.funcname for f in frames] + self.assertIn("child_fn", func_names) + self.assertIn("parent_fn", func_names) + + def test_multiple_leaf_tasks(self): + """Test _find_leaf_tasks: identifies multiple leaves correctly. + + Task graph (fan-out from root): + + Root (id=1) + / \ + Leaf1 (id=10) Leaf2 (id=20) + + Expected: 2 stacks (one for each leaf). 
+ """ + collector = PstatsCollector(sample_interval_usec=1000) + leaf1 = MockTaskInfo( + task_id=10, + task_name="Leaf1", + coroutine_stack=[MockCoroInfo(task_name="Leaf1", call_stack=[MockFrameInfo("l1.py", 1, "f1")])], + awaited_by=[MockCoroInfo(task_name=1, call_stack=[MockFrameInfo("r.py", 5, "root")])] + ) + + leaf2 = MockTaskInfo( + task_id=20, + task_name="Leaf2", + coroutine_stack=[MockCoroInfo(task_name="Leaf2", call_stack=[MockFrameInfo("l2.py", 2, "f2")])], + awaited_by=[MockCoroInfo(task_name=1, call_stack=[MockFrameInfo("r.py", 5, "root")])] + ) + + root = MockTaskInfo( + task_id=1, + task_name="Root", + coroutine_stack=[MockCoroInfo(task_name="Root", call_stack=[MockFrameInfo("r.py", 10, "main")])], + awaited_by=[] + ) + + awaited_info_list = [MockAwaitedInfo(thread_id=300, awaited_by=[leaf1, leaf2, root])] + stacks = list(collector._iter_async_frames(awaited_info_list)) + + # Two leaves = two stacks + self.assertEqual(len(stacks), 2) + leaf_ids = {leaf_id for _, _, leaf_id in stacks} + self.assertEqual(leaf_ids, {10, 20}) + + def test_cycle_detection(self): + """Test _build_linear_stacks: cycle detection prevents infinite loops. + + Task graph (cyclic dependency): + + A (id=1) <---> B (id=2) + + Neither task is a leaf (both have parents), so no stacks are produced. + """ + collector = PstatsCollector(sample_interval_usec=1000) + task_a = MockTaskInfo( + task_id=1, + task_name="A", + coroutine_stack=[MockCoroInfo(task_name="A", call_stack=[MockFrameInfo("a.py", 1, "a")])], + awaited_by=[MockCoroInfo(task_name=2, call_stack=[MockFrameInfo("b.py", 5, "b")])] + ) + + task_b = MockTaskInfo( + task_id=2, + task_name="B", + coroutine_stack=[MockCoroInfo(task_name="B", call_stack=[MockFrameInfo("b.py", 10, "b")])], + awaited_by=[MockCoroInfo(task_name=1, call_stack=[MockFrameInfo("a.py", 15, "a")])] + ) + + awaited_info_list = [MockAwaitedInfo(thread_id=400, awaited_by=[task_a, task_b])] + stacks = list(collector._iter_async_frames(awaited_info_list)) + + # No leaves (both have parents), should return empty + self.assertEqual(len(stacks), 0) + + def test_orphaned_parent_reference(self): + """Test _build_linear_stacks: handles parent ID not in task_map.""" + collector = PstatsCollector(sample_interval_usec=1000) + + # Task references non-existent parent + orphan = MockTaskInfo( + task_id=5, + task_name="Orphan", + coroutine_stack=[MockCoroInfo(task_name="Orphan", call_stack=[MockFrameInfo("o.py", 1, "orphan")])], + awaited_by=[MockCoroInfo(task_name=999, call_stack=[])] # 999 doesn't exist + ) + + awaited_info_list = [MockAwaitedInfo(thread_id=500, awaited_by=[orphan])] + stacks = list(collector._iter_async_frames(awaited_info_list)) + + # Stops at missing parent, yields what it has + self.assertEqual(len(stacks), 1) + frames, _, leaf_id = stacks[0] + self.assertEqual(leaf_id, 5) + + def test_multiple_coroutines_per_task(self): + """Test _build_linear_stacks: collects frames from all coroutines in task.""" + collector = PstatsCollector(sample_interval_usec=1000) + + # Task with multiple coroutines (e.g., nested async generators) + task = MockTaskInfo( + task_id=7, + task_name="Multi", + coroutine_stack=[ + MockCoroInfo(task_name="Multi", call_stack=[MockFrameInfo("g.py", 5, "gen1")]), + MockCoroInfo(task_name="Multi", call_stack=[MockFrameInfo("g.py", 10, "gen2")]), + ], + awaited_by=[] + ) + + awaited_info_list = [MockAwaitedInfo(thread_id=600, awaited_by=[task])] + stacks = list(collector._iter_async_frames(awaited_info_list)) + + self.assertEqual(len(stacks), 1) + frames, _, _ = 
stacks[0] + + # Both coroutine frames should be present + func_names = [f.funcname for f in frames] + self.assertIn("gen1", func_names) + self.assertIn("gen2", func_names) + + def test_multiple_threads(self): + """Test _build_task_graph: handles multiple AwaitedInfo (different threads).""" + collector = PstatsCollector(sample_interval_usec=1000) + + # Two threads with separate task trees + thread1_task = MockTaskInfo( + task_id=100, + task_name="T1", + coroutine_stack=[MockCoroInfo(task_name="T1", call_stack=[MockFrameInfo("t1.py", 1, "t1")])], + awaited_by=[] + ) + + thread2_task = MockTaskInfo( + task_id=200, + task_name="T2", + coroutine_stack=[MockCoroInfo(task_name="T2", call_stack=[MockFrameInfo("t2.py", 1, "t2")])], + awaited_by=[] + ) + + awaited_info_list = [ + MockAwaitedInfo(thread_id=1, awaited_by=[thread1_task]), + MockAwaitedInfo(thread_id=2, awaited_by=[thread2_task]), + ] + + stacks = list(collector._iter_async_frames(awaited_info_list)) + + # Two threads = two stacks + self.assertEqual(len(stacks), 2) + + # Verify thread IDs preserved + thread_ids = {thread_id for _, thread_id, _ in stacks} + self.assertEqual(thread_ids, {1, 2}) + + def test_collect_public_interface(self): + """Test collect() method correctly routes to async frame processing.""" + collector = PstatsCollector(sample_interval_usec=1000) + + child = MockTaskInfo( + task_id=50, + task_name="Child", + coroutine_stack=[MockCoroInfo(task_name="Child", call_stack=[MockFrameInfo("c.py", 1, "child")])], + awaited_by=[MockCoroInfo(task_name=51, call_stack=[])] + ) + + parent = MockTaskInfo( + task_id=51, + task_name="Parent", + coroutine_stack=[MockCoroInfo(task_name="Parent", call_stack=[MockFrameInfo("p.py", 1, "parent")])], + awaited_by=[] + ) + + awaited_info_list = [MockAwaitedInfo(thread_id=999, awaited_by=[child, parent])] + + # Public interface: collect() + collector.collect(awaited_info_list) + + # Verify stats collected + self.assertGreater(len(collector.result), 0) + func_names = [loc[2] for loc in collector.result.keys()] + self.assertIn("child", func_names) + self.assertIn("parent", func_names) + + def test_diamond_pattern_multiple_parents(self): + """Test _build_linear_stacks: task with 2+ parents picks one deterministically. + + CRITICAL: Tests that when a task has multiple parents, we pick one parent + deterministically (sorted, first one) and annotate the task name with parent count. 
+ """ + collector = PstatsCollector(sample_interval_usec=1000) + + # Diamond pattern: Root spawns A and B, both await Child + # + # Root (id=1) + # / \ + # A (id=2) B (id=3) + # \ / + # Child (id=4) + # + + child = MockTaskInfo( + task_id=4, + task_name="Child", + coroutine_stack=[MockCoroInfo(task_name="Child", call_stack=[MockFrameInfo("c.py", 1, "child_work")])], + awaited_by=[ + MockCoroInfo(task_name=2, call_stack=[MockFrameInfo("a.py", 5, "a_await")]), # Parent A + MockCoroInfo(task_name=3, call_stack=[MockFrameInfo("b.py", 5, "b_await")]), # Parent B + ] + ) + + parent_a = MockTaskInfo( + task_id=2, + task_name="A", + coroutine_stack=[MockCoroInfo(task_name="A", call_stack=[MockFrameInfo("a.py", 10, "a_work")])], + awaited_by=[MockCoroInfo(task_name=1, call_stack=[MockFrameInfo("root.py", 5, "root_spawn")])] + ) + + parent_b = MockTaskInfo( + task_id=3, + task_name="B", + coroutine_stack=[MockCoroInfo(task_name="B", call_stack=[MockFrameInfo("b.py", 10, "b_work")])], + awaited_by=[MockCoroInfo(task_name=1, call_stack=[MockFrameInfo("root.py", 5, "root_spawn")])] + ) + + root = MockTaskInfo( + task_id=1, + task_name="Root", + coroutine_stack=[MockCoroInfo(task_name="Root", call_stack=[MockFrameInfo("root.py", 20, "main")])], + awaited_by=[] + ) + + awaited_info_list = [MockAwaitedInfo(thread_id=777, awaited_by=[child, parent_a, parent_b, root])] + stacks = list(collector._iter_async_frames(awaited_info_list)) + + # Should get 1 stack: Child->A->Root (picks parent with lowest ID: 2) + self.assertEqual(len(stacks), 1, "Diamond should create only 1 path, picking first sorted parent") + + # Verify the single stack + frames, thread_id, leaf_id = stacks[0] + self.assertEqual(leaf_id, 4) + self.assertEqual(thread_id, 777) + + func_names = [f.funcname for f in frames] + # Stack should contain child, parent A (id=2, first when sorted), and root + self.assertIn("child_work", func_names) + self.assertIn("a_work", func_names, "Should use parent A (id=2, first when sorted)") + self.assertNotIn("b_work", func_names, "Should not include parent B") + self.assertIn("main", func_names) + + # Verify Child task is annotated with parent count + self.assertIn("Child (2 parents)", func_names, "Child task should be annotated with parent count") + + def test_empty_coroutine_stack(self): + """Test _build_linear_stacks: handles empty coroutine_stack (line 109 condition false).""" + collector = PstatsCollector(sample_interval_usec=1000) + + # Task with no coroutine_stack + task = MockTaskInfo( + task_id=99, + task_name="EmptyStack", + coroutine_stack=[], # Empty! 
+ awaited_by=[] + ) + + awaited_info_list = [MockAwaitedInfo(thread_id=111, awaited_by=[task])] + stacks = list(collector._iter_async_frames(awaited_info_list)) + + self.assertEqual(len(stacks), 1) + frames, _, _ = stacks[0] + + # Should only have task marker, no function frames + func_names = [f.funcname for f in frames] + self.assertEqual(len(func_names), 1, "Should only have task marker") + self.assertIn("EmptyStack", func_names) + + def test_orphaned_parent_with_no_frames_collected(self): + """Test _build_linear_stacks: orphaned parent at start with empty frames (line 94-96).""" + collector = PstatsCollector(sample_interval_usec=1000) + + # Leaf that doesn't exist in task_map (should not happen normally, but test robustness) + # We'll create a scenario where the leaf_id is present but empty + + # Task references non-existent parent, and has no coroutine_stack + orphan = MockTaskInfo( + task_id=88, + task_name="Orphan", + coroutine_stack=[], # No frames + awaited_by=[MockCoroInfo(task_name=999, call_stack=[])] # Parent doesn't exist + ) + + awaited_info_list = [MockAwaitedInfo(thread_id=222, awaited_by=[orphan])] + stacks = list(collector._iter_async_frames(awaited_info_list)) + + # Should yield because we have the task marker even with no function frames + self.assertEqual(len(stacks), 1) + frames, _, leaf_id = stacks[0] + self.assertEqual(leaf_id, 88) + # Has task marker but no function frames + self.assertGreater(len(frames), 0, "Should have at least task marker") + + def test_frame_ordering(self): + """Test _build_linear_stacks: frames are collected in correct order (leaf->root). + + Task graph (3-level chain): + + Root (id=1) <- root_bottom, root_top + | + Middle (id=2) <- mid_bottom, mid_top + | + Leaf (id=3) <- leaf_bottom, leaf_top + + Expected frame order: leaf_bottom, leaf_top, mid_bottom, mid_top, root_bottom, root_top + (stack is built bottom-up: leaf frames first, then parent frames). 
+ """ + collector = PstatsCollector(sample_interval_usec=1000) + leaf = MockTaskInfo( + task_id=3, + task_name="Leaf", + coroutine_stack=[ + MockCoroInfo(task_name="Leaf", call_stack=[ + MockFrameInfo("leaf.py", 1, "leaf_bottom"), + MockFrameInfo("leaf.py", 2, "leaf_top"), + ]) + ], + awaited_by=[MockCoroInfo(task_name=2, call_stack=[])] + ) + + middle = MockTaskInfo( + task_id=2, + task_name="Middle", + coroutine_stack=[ + MockCoroInfo(task_name="Middle", call_stack=[ + MockFrameInfo("mid.py", 1, "mid_bottom"), + MockFrameInfo("mid.py", 2, "mid_top"), + ]) + ], + awaited_by=[MockCoroInfo(task_name=1, call_stack=[])] + ) + + root = MockTaskInfo( + task_id=1, + task_name="Root", + coroutine_stack=[ + MockCoroInfo(task_name="Root", call_stack=[ + MockFrameInfo("root.py", 1, "root_bottom"), + MockFrameInfo("root.py", 2, "root_top"), + ]) + ], + awaited_by=[] + ) + + awaited_info_list = [MockAwaitedInfo(thread_id=333, awaited_by=[leaf, middle, root])] + stacks = list(collector._iter_async_frames(awaited_info_list)) + + self.assertEqual(len(stacks), 1) + frames, _, _ = stacks[0] + + func_names = [f.funcname for f in frames] + + # Order should be: leaf frames, leaf marker, middle frames, middle marker, root frames, root marker + leaf_bottom_idx = func_names.index("leaf_bottom") + leaf_top_idx = func_names.index("leaf_top") + mid_bottom_idx = func_names.index("mid_bottom") + root_bottom_idx = func_names.index("root_bottom") + + # Verify leaf comes before middle comes before root + self.assertLess(leaf_bottom_idx, leaf_top_idx, "Leaf frames in order") + self.assertLess(leaf_top_idx, mid_bottom_idx, "Leaf before middle") + self.assertLess(mid_bottom_idx, root_bottom_idx, "Middle before root") + + def test_complex_multi_parent_convergence(self): + """Test _build_linear_stacks: multiple leaves with same parents pick deterministically. + + Tests that when multiple leaves have multiple parents, each leaf picks the same + parent (sorted, first one) and all leaves are annotated with parent count. + + Task graph structure (both leaves awaited by both A and B):: + + Root (id=1) + / \\ + A (id=2) B (id=3) + | \\ / | + | \\ / | + | \\/ | + | /\\ | + | / \\ | + LeafX (id=4) LeafY (id=5) + + Expected behavior: Both leaves pick parent A (lowest id=2) for their stack path. + Result: 2 stacks, both going through A -> Root (B is skipped). 
+ """ + collector = PstatsCollector(sample_interval_usec=1000) + + leaf_x = MockTaskInfo( + task_id=4, + task_name="LeafX", + coroutine_stack=[MockCoroInfo(task_name="LeafX", call_stack=[MockFrameInfo("x.py", 1, "x")])], + awaited_by=[ + MockCoroInfo(task_name=2, call_stack=[]), + MockCoroInfo(task_name=3, call_stack=[]), + ] + ) + + leaf_y = MockTaskInfo( + task_id=5, + task_name="LeafY", + coroutine_stack=[MockCoroInfo(task_name="LeafY", call_stack=[MockFrameInfo("y.py", 1, "y")])], + awaited_by=[ + MockCoroInfo(task_name=2, call_stack=[]), + MockCoroInfo(task_name=3, call_stack=[]), + ] + ) + + parent_a = MockTaskInfo( + task_id=2, + task_name="A", + coroutine_stack=[MockCoroInfo(task_name="A", call_stack=[MockFrameInfo("a.py", 1, "a")])], + awaited_by=[MockCoroInfo(task_name=1, call_stack=[])] + ) + + parent_b = MockTaskInfo( + task_id=3, + task_name="B", + coroutine_stack=[MockCoroInfo(task_name="B", call_stack=[MockFrameInfo("b.py", 1, "b")])], + awaited_by=[MockCoroInfo(task_name=1, call_stack=[])] + ) + + root = MockTaskInfo( + task_id=1, + task_name="Root", + coroutine_stack=[MockCoroInfo(task_name="Root", call_stack=[MockFrameInfo("r.py", 1, "root")])], + awaited_by=[] + ) + + awaited_info_list = [MockAwaitedInfo(thread_id=444, awaited_by=[leaf_x, leaf_y, parent_a, parent_b, root])] + stacks = list(collector._iter_async_frames(awaited_info_list)) + + # 2 leaves, each picks same parent (A, id=2) = 2 paths + self.assertEqual(len(stacks), 2, "Should create 2 paths: X->A->Root, Y->A->Root") + + # Verify both leaves pick parent A (id=2, first when sorted) + leaf_ids_seen = set() + for frames, _, leaf_id in stacks: + leaf_ids_seen.add(leaf_id) + func_names = [f.funcname for f in frames] + + # Both stacks should go through parent A only + self.assertIn("a", func_names, "Should use parent A (id=2, first when sorted)") + self.assertNotIn("b", func_names, "Should not include parent B") + self.assertIn("root", func_names, "Should reach root") + + # Check for parent count annotation on the leaf + if leaf_id == 4: + self.assertIn("x", func_names) + self.assertIn("LeafX (2 parents)", func_names, "LeafX should be annotated with parent count") + elif leaf_id == 5: + self.assertIn("y", func_names) + self.assertIn("LeafY (2 parents)", func_names, "LeafY should be annotated with parent count") + + # Both leaves should be represented + self.assertEqual(leaf_ids_seen, {4, 5}, "Both LeafX and LeafY should have paths") + + +class TestFlamegraphCollectorAsync(unittest.TestCase): + """Test FlamegraphCollector with async frames.""" + + def test_flamegraph_with_async_frames(self): + """Test FlamegraphCollector correctly processes async task frames.""" + from profiling.sampling.stack_collector import FlamegraphCollector + + collector = FlamegraphCollector(sample_interval_usec=1000) + + # Build async task tree: Root -> Child + child = MockTaskInfo( + task_id=2, + task_name="ChildTask", + coroutine_stack=[ + MockCoroInfo( + task_name="ChildTask", + call_stack=[MockFrameInfo("child.py", 10, "child_work")] + ) + ], + awaited_by=[MockCoroInfo(task_name=1, call_stack=[])] + ) + + root = MockTaskInfo( + task_id=1, + task_name="RootTask", + coroutine_stack=[ + MockCoroInfo( + task_name="RootTask", + call_stack=[MockFrameInfo("root.py", 20, "root_work")] + ) + ], + awaited_by=[] + ) + + awaited_info_list = [MockAwaitedInfo(thread_id=100, awaited_by=[child, root])] + + # Collect async frames + collector.collect(awaited_info_list) + + # Verify samples were collected + self.assertGreater(collector._total_samples, 0) + 
+ # Verify the flamegraph tree structure contains our functions + root_node = collector._root + self.assertGreater(root_node["samples"], 0) + + # Check that thread ID was tracked + self.assertIn(100, collector._all_threads) + + def test_flamegraph_with_task_markers(self): + """Test FlamegraphCollector includes boundary markers.""" + from profiling.sampling.stack_collector import FlamegraphCollector + + collector = FlamegraphCollector(sample_interval_usec=1000) + + task = MockTaskInfo( + task_id=42, + task_name="MyTask", + coroutine_stack=[ + MockCoroInfo( + task_name="MyTask", + call_stack=[MockFrameInfo("work.py", 5, "do_work")] + ) + ], + awaited_by=[] + ) + + awaited_info_list = [MockAwaitedInfo(thread_id=200, awaited_by=[task])] + collector.collect(awaited_info_list) + + # Find marker in the tree + def find_task_marker(node, depth=0): + for func, child in node.get("children", {}).items(): + if func[0] == "": + return func + result = find_task_marker(child, depth + 1) + if result: + return result + return None + + task_marker = find_task_marker(collector._root) + self.assertIsNotNone(task_marker, "Should have marker in tree") + self.assertEqual(task_marker[0], "") + self.assertIn("MyTask", task_marker[2]) + + def test_flamegraph_multiple_async_samples(self): + """Test FlamegraphCollector aggregates multiple async samples correctly.""" + from profiling.sampling.stack_collector import FlamegraphCollector + + collector = FlamegraphCollector(sample_interval_usec=1000) + + task = MockTaskInfo( + task_id=1, + task_name="Task", + coroutine_stack=[ + MockCoroInfo( + task_name="Task", + call_stack=[MockFrameInfo("work.py", 10, "work")] + ) + ], + awaited_by=[] + ) + + awaited_info_list = [MockAwaitedInfo(thread_id=300, awaited_by=[task])] + + # Collect multiple samples + for _ in range(5): + collector.collect(awaited_info_list) + + # Verify sample count + self.assertEqual(collector._sample_count, 5) + self.assertEqual(collector._total_samples, 5) + + +class TestAsyncAwareParameterFlow(unittest.TestCase): + """Integration tests for async_aware parameter flow from CLI to unwinder.""" + + def test_sample_function_accepts_async_aware(self): + """Test that sample() function accepts async_aware parameter.""" + from profiling.sampling.sample import sample + import inspect + + sig = inspect.signature(sample) + self.assertIn("async_aware", sig.parameters) + + def test_sample_live_function_accepts_async_aware(self): + """Test that sample_live() function accepts async_aware parameter.""" + from profiling.sampling.sample import sample_live + import inspect + + sig = inspect.signature(sample_live) + self.assertIn("async_aware", sig.parameters) + + def test_sample_profiler_sample_accepts_async_aware(self): + """Test that SampleProfiler.sample() accepts async_aware parameter.""" + from profiling.sampling.sample import SampleProfiler + import inspect + + sig = inspect.signature(SampleProfiler.sample) + self.assertIn("async_aware", sig.parameters) + + def test_async_aware_all_sees_sleeping_and_running_tasks(self): + """Test async_aware='all' captures both sleeping and CPU-running tasks.""" + # Sleeping task (awaiting) + sleeping_task = MockTaskInfo( + task_id=1, + task_name="SleepingTask", + coroutine_stack=[ + MockCoroInfo( + task_name="SleepingTask", + call_stack=[MockFrameInfo("sleeper.py", 10, "sleep_work")] + ) + ], + awaited_by=[] + ) + + # CPU-running task (active) + running_task = MockTaskInfo( + task_id=2, + task_name="RunningTask", + coroutine_stack=[ + MockCoroInfo( + task_name="RunningTask", + 
call_stack=[MockFrameInfo("runner.py", 20, "cpu_work")] + ) + ], + awaited_by=[] + ) + + # Both tasks returned by get_all_awaited_by + awaited_info_list = [MockAwaitedInfo(thread_id=100, awaited_by=[sleeping_task, running_task])] + + collector = PstatsCollector(sample_interval_usec=1000) + collector.collect(awaited_info_list) + collector.create_stats() + + # Both tasks should be visible + sleeping_key = ("sleeper.py", 10, "sleep_work") + running_key = ("runner.py", 20, "cpu_work") + + self.assertIn(sleeping_key, collector.stats) + self.assertIn(running_key, collector.stats) + + # Task markers should also be present + task_keys = [k for k in collector.stats if k[0] == ""] + self.assertGreater(len(task_keys), 0, "Should have markers in stats") + + # Verify task names are in the markers + task_names = [k[2] for k in task_keys] + self.assertTrue( + any("SleepingTask" in name for name in task_names), + "SleepingTask should be in task markers" + ) + self.assertTrue( + any("RunningTask" in name for name in task_names), + "RunningTask should be in task markers" + ) + + def test_async_aware_running_sees_only_running_task(self): + """Test async_aware='running' only shows the currently running task stack.""" + # Only the running task's stack is returned by get_async_stack_trace + running_task = MockTaskInfo( + task_id=2, + task_name="RunningTask", + coroutine_stack=[ + MockCoroInfo( + task_name="RunningTask", + call_stack=[MockFrameInfo("runner.py", 20, "cpu_work")] + ) + ], + awaited_by=[] + ) + + # get_async_stack_trace only returns the running task + awaited_info_list = [MockAwaitedInfo(thread_id=100, awaited_by=[running_task])] + + collector = PstatsCollector(sample_interval_usec=1000) + collector.collect(awaited_info_list) + collector.create_stats() + + # Only running task should be visible + running_key = ("runner.py", 20, "cpu_work") + self.assertIn(running_key, collector.stats) + + # Verify we don't see the sleeping task (it wasn't in the input) + sleeping_key = ("sleeper.py", 10, "sleep_work") + self.assertNotIn(sleeping_key, collector.stats) + + # Task marker for running task should be present + task_keys = [k for k in collector.stats if k[0] == ""] + self.assertGreater(len(task_keys), 0, "Should have markers in stats") + + task_names = [k[2] for k in task_keys] + self.assertTrue( + any("RunningTask" in name for name in task_names), + "RunningTask should be in task markers" + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_cli.py b/Lib/test/test_profiling/test_sampling_profiler/test_cli.py index 673e1c0d93c..e1892ec9155 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_cli.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_cli.py @@ -547,3 +547,165 @@ def test_sort_options(self): mock_sample.assert_called_once() mock_sample.reset_mock() + + def test_async_aware_flag_defaults_to_running(self): + """Test --async-aware flag enables async profiling with default 'running' mode.""" + test_args = ["profiling.sampling.cli", "attach", "12345", "--async-aware"] + + with ( + mock.patch("sys.argv", test_args), + mock.patch("profiling.sampling.cli.sample") as mock_sample, + ): + from profiling.sampling.cli import main + main() + + mock_sample.assert_called_once() + # Verify async_aware was passed with default "running" mode + call_kwargs = mock_sample.call_args[1] + self.assertEqual(call_kwargs.get("async_aware"), "running") + + def test_async_aware_with_async_mode_all(self): + """Test --async-aware with 
--async-mode all.""" + test_args = ["profiling.sampling.cli", "attach", "12345", "--async-aware", "--async-mode", "all"] + + with ( + mock.patch("sys.argv", test_args), + mock.patch("profiling.sampling.cli.sample") as mock_sample, + ): + from profiling.sampling.cli import main + main() + + mock_sample.assert_called_once() + call_kwargs = mock_sample.call_args[1] + self.assertEqual(call_kwargs.get("async_aware"), "all") + + def test_async_aware_default_is_none(self): + """Test async_aware defaults to None when --async-aware not specified.""" + test_args = ["profiling.sampling.cli", "attach", "12345"] + + with ( + mock.patch("sys.argv", test_args), + mock.patch("profiling.sampling.cli.sample") as mock_sample, + ): + from profiling.sampling.cli import main + main() + + mock_sample.assert_called_once() + call_kwargs = mock_sample.call_args[1] + self.assertIsNone(call_kwargs.get("async_aware")) + + def test_async_mode_invalid_choice(self): + """Test --async-mode with invalid choice raises error.""" + test_args = ["profiling.sampling.cli", "attach", "12345", "--async-aware", "--async-mode", "invalid"] + + with ( + mock.patch("sys.argv", test_args), + mock.patch("sys.stderr", io.StringIO()), + self.assertRaises(SystemExit) as cm, + ): + from profiling.sampling.cli import main + main() + + self.assertEqual(cm.exception.code, 2) # argparse error + + def test_async_mode_requires_async_aware(self): + """Test --async-mode without --async-aware raises error.""" + test_args = ["profiling.sampling.cli", "attach", "12345", "--async-mode", "all"] + + with ( + mock.patch("sys.argv", test_args), + mock.patch("sys.stderr", io.StringIO()) as mock_stderr, + self.assertRaises(SystemExit) as cm, + ): + from profiling.sampling.cli import main + main() + + self.assertEqual(cm.exception.code, 2) # argparse error + error_msg = mock_stderr.getvalue() + self.assertIn("--async-mode requires --async-aware", error_msg) + + def test_async_aware_incompatible_with_native(self): + """Test --async-aware is incompatible with --native.""" + test_args = ["profiling.sampling.cli", "attach", "12345", "--async-aware", "--native"] + + with ( + mock.patch("sys.argv", test_args), + mock.patch("sys.stderr", io.StringIO()) as mock_stderr, + self.assertRaises(SystemExit) as cm, + ): + from profiling.sampling.cli import main + main() + + self.assertEqual(cm.exception.code, 2) # argparse error + error_msg = mock_stderr.getvalue() + self.assertIn("--native", error_msg) + self.assertIn("incompatible with --async-aware", error_msg) + + def test_async_aware_incompatible_with_no_gc(self): + """Test --async-aware is incompatible with --no-gc.""" + test_args = ["profiling.sampling.cli", "attach", "12345", "--async-aware", "--no-gc"] + + with ( + mock.patch("sys.argv", test_args), + mock.patch("sys.stderr", io.StringIO()) as mock_stderr, + self.assertRaises(SystemExit) as cm, + ): + from profiling.sampling.cli import main + main() + + self.assertEqual(cm.exception.code, 2) # argparse error + error_msg = mock_stderr.getvalue() + self.assertIn("--no-gc", error_msg) + self.assertIn("incompatible with --async-aware", error_msg) + + def test_async_aware_incompatible_with_both_native_and_no_gc(self): + """Test --async-aware is incompatible with both --native and --no-gc.""" + test_args = ["profiling.sampling.cli", "attach", "12345", "--async-aware", "--native", "--no-gc"] + + with ( + mock.patch("sys.argv", test_args), + mock.patch("sys.stderr", io.StringIO()) as mock_stderr, + self.assertRaises(SystemExit) as cm, + ): + from profiling.sampling.cli 
import main + main() + + self.assertEqual(cm.exception.code, 2) # argparse error + error_msg = mock_stderr.getvalue() + self.assertIn("--native", error_msg) + self.assertIn("--no-gc", error_msg) + self.assertIn("incompatible with --async-aware", error_msg) + + def test_async_aware_incompatible_with_mode(self): + """Test --async-aware is incompatible with --mode (non-wall).""" + test_args = ["profiling.sampling.cli", "attach", "12345", "--async-aware", "--mode", "cpu"] + + with ( + mock.patch("sys.argv", test_args), + mock.patch("sys.stderr", io.StringIO()) as mock_stderr, + self.assertRaises(SystemExit) as cm, + ): + from profiling.sampling.cli import main + main() + + self.assertEqual(cm.exception.code, 2) # argparse error + error_msg = mock_stderr.getvalue() + self.assertIn("--mode=cpu", error_msg) + self.assertIn("incompatible with --async-aware", error_msg) + + def test_async_aware_incompatible_with_all_threads(self): + """Test --async-aware is incompatible with --all-threads.""" + test_args = ["profiling.sampling.cli", "attach", "12345", "--async-aware", "--all-threads"] + + with ( + mock.patch("sys.argv", test_args), + mock.patch("sys.stderr", io.StringIO()) as mock_stderr, + self.assertRaises(SystemExit) as cm, + ): + from profiling.sampling.cli import main + main() + + self.assertEqual(cm.exception.code, 2) # argparse error + error_msg = mock_stderr.getvalue() + self.assertIn("--all-threads", error_msg) + self.assertIn("incompatible with --async-aware", error_msg) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_integration.py b/Lib/test/test_profiling/test_sampling_profiler/test_integration.py index e4c5032425d..e92b3f45fbc 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_integration.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_integration.py @@ -39,6 +39,9 @@ # Duration for profiling tests - long enough for process to complete naturally PROFILING_TIMEOUT = str(int(SHORT_TIMEOUT)) +# Duration for profiling in tests - short enough to complete quickly +PROFILING_DURATION_SEC = 2 + @skip_if_not_supported @unittest.skipIf( @@ -359,23 +362,14 @@ def total_occurrences(func): self.assertEqual(total_occurrences(main_key), 2) -@requires_subprocess() -@skip_if_not_supported -class TestSampleProfilerIntegration(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.test_script = ''' -import time -import os - +# Shared workload functions for test scripts +_WORKLOAD_FUNCTIONS = ''' def slow_fibonacci(n): - """Recursive fibonacci - should show up prominently in profiler.""" if n <= 1: return n return slow_fibonacci(n-1) + slow_fibonacci(n-2) def cpu_intensive_work(): - """CPU intensive work that should show in profiler.""" result = 0 for i in range(10000): result += i * i @@ -383,33 +377,48 @@ def cpu_intensive_work(): result = result % 1000000 return result -def main_loop(): - """Main test loop.""" - max_iterations = 200 - - for iteration in range(max_iterations): +def do_work(): + iteration = 0 + while True: if iteration % 2 == 0: - result = slow_fibonacci(15) + slow_fibonacci(15) else: - result = cpu_intensive_work() + cpu_intensive_work() + iteration += 1 +''' -if __name__ == "__main__": - main_loop() + +@requires_subprocess() +@skip_if_not_supported +class TestSampleProfilerIntegration(unittest.TestCase): + @classmethod + def setUpClass(cls): + # Test script for use with test_subprocess() - signals when work starts + cls.test_script = _WORKLOAD_FUNCTIONS + ''' +_test_sock.sendall(b"working") +do_work() +''' + # CLI test script - runs 
for fixed duration (no socket sync) + cls.cli_test_script = ''' +import time +''' + _WORKLOAD_FUNCTIONS.replace( + 'while True:', 'end_time = time.time() + 30\n while time.time() < end_time:' +) + ''' +do_work() ''' def test_sampling_basic_functionality(self): with ( - test_subprocess(self.test_script) as subproc, + test_subprocess(self.test_script, wait_for_working=True) as subproc, io.StringIO() as captured_output, mock.patch("sys.stdout", captured_output), ): try: - # Sample for up to SHORT_TIMEOUT seconds, but process exits after fixed iterations collector = PstatsCollector(sample_interval_usec=1000, skip_idle=False) profiling.sampling.sample.sample( subproc.process.pid, collector, - duration_sec=SHORT_TIMEOUT, + duration_sec=PROFILING_DURATION_SEC, ) collector.print_stats(show_summary=False) except PermissionError: @@ -431,7 +440,7 @@ def test_sampling_with_pstats_export(self): ) self.addCleanup(close_and_unlink, pstats_out) - with test_subprocess(self.test_script) as subproc: + with test_subprocess(self.test_script, wait_for_working=True) as subproc: # Suppress profiler output when testing file export with ( io.StringIO() as captured_output, @@ -442,7 +451,7 @@ def test_sampling_with_pstats_export(self): profiling.sampling.sample.sample( subproc.process.pid, collector, - duration_sec=1, + duration_sec=PROFILING_DURATION_SEC, ) collector.export(pstats_out.name) except PermissionError: @@ -476,7 +485,7 @@ def test_sampling_with_collapsed_export(self): self.addCleanup(close_and_unlink, collapsed_file) with ( - test_subprocess(self.test_script) as subproc, + test_subprocess(self.test_script, wait_for_working=True) as subproc, ): # Suppress profiler output when testing file export with ( @@ -488,7 +497,7 @@ def test_sampling_with_collapsed_export(self): profiling.sampling.sample.sample( subproc.process.pid, collector, - duration_sec=1, + duration_sec=PROFILING_DURATION_SEC, ) collector.export(collapsed_file.name) except PermissionError: @@ -526,7 +535,7 @@ def test_sampling_with_collapsed_export(self): def test_sampling_all_threads(self): with ( - test_subprocess(self.test_script) as subproc, + test_subprocess(self.test_script, wait_for_working=True) as subproc, # Suppress profiler output io.StringIO() as captured_output, mock.patch("sys.stdout", captured_output), @@ -536,7 +545,7 @@ def test_sampling_all_threads(self): profiling.sampling.sample.sample( subproc.process.pid, collector, - duration_sec=1, + duration_sec=PROFILING_DURATION_SEC, all_threads=True, ) collector.print_stats(show_summary=False) @@ -548,12 +557,16 @@ def test_sampling_all_threads(self): def test_sample_target_script(self): script_file = tempfile.NamedTemporaryFile(delete=False) - script_file.write(self.test_script.encode("utf-8")) + script_file.write(self.cli_test_script.encode("utf-8")) script_file.flush() self.addCleanup(close_and_unlink, script_file) - # Sample for up to SHORT_TIMEOUT seconds, but process exits after fixed iterations - test_args = ["profiling.sampling.sample", "run", "-d", PROFILING_TIMEOUT, script_file.name] + # Sample for PROFILING_DURATION_SEC seconds + test_args = [ + "profiling.sampling.sample", "run", + "-d", str(PROFILING_DURATION_SEC), + script_file.name + ] with ( mock.patch("sys.argv", test_args), @@ -583,13 +596,13 @@ def test_sample_target_module(self): module_path = os.path.join(tempdir.name, "test_module.py") with open(module_path, "w") as f: - f.write(self.test_script) + f.write(self.cli_test_script) test_args = [ "profiling.sampling.cli", "run", "-d", - PROFILING_TIMEOUT, + 
str(PROFILING_DURATION_SEC), "-m", "test_module", ] @@ -630,8 +643,10 @@ def test_invalid_pid(self): profiling.sampling.sample.sample(-1, collector, duration_sec=1) def test_process_dies_during_sampling(self): + # Use wait_for_working=False since this simple script doesn't send "working" with test_subprocess( - "import time; time.sleep(0.5); exit()" + "import time; time.sleep(0.5); exit()", + wait_for_working=False ) as subproc: with ( io.StringIO() as captured_output, @@ -654,7 +669,11 @@ def test_process_dies_during_sampling(self): self.assertIn("Error rate", output) def test_is_process_running(self): - with test_subprocess("import time; time.sleep(1000)") as subproc: + # Use wait_for_working=False since this simple script doesn't send "working" + with test_subprocess( + "import time; time.sleep(1000)", + wait_for_working=False + ) as subproc: try: profiler = SampleProfiler( pid=subproc.process.pid, @@ -681,7 +700,11 @@ def test_is_process_running(self): @unittest.skipUnless(sys.platform == "linux", "Only valid on Linux") def test_esrch_signal_handling(self): - with test_subprocess("import time; time.sleep(1000)") as subproc: + # Use wait_for_working=False since this simple script doesn't send "working" + with test_subprocess( + "import time; time.sleep(1000)", + wait_for_working=False + ) as subproc: try: unwinder = _remote_debugging.RemoteUnwinder( subproc.process.pid @@ -780,3 +803,124 @@ def test_live_incompatible_with_pstats_default_values(self): from profiling.sampling.cli import main main() self.assertNotEqual(cm.exception.code, 0) + + +@requires_subprocess() +@skip_if_not_supported +@unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", +) +class TestAsyncAwareProfilingIntegration(unittest.TestCase): + """Integration tests for async-aware profiling mode.""" + + @classmethod + def setUpClass(cls): + # Async test script that runs indefinitely until killed. + # Sends "working" signal AFTER tasks are created and scheduled. + cls.async_script = ''' +import asyncio + +async def sleeping_leaf(): + while True: + await asyncio.sleep(0.02) + +async def cpu_leaf(): + total = 0 + while True: + for i in range(10000): + total += i * i + await asyncio.sleep(0) + +async def supervisor(): + tasks = [ + asyncio.create_task(sleeping_leaf(), name="Sleeper-0"), + asyncio.create_task(sleeping_leaf(), name="Sleeper-1"), + asyncio.create_task(sleeping_leaf(), name="Sleeper-2"), + asyncio.create_task(cpu_leaf(), name="Worker"), + ] + await asyncio.sleep(0) # Let tasks get scheduled + _test_sock.sendall(b"working") + await asyncio.gather(*tasks) + +asyncio.run(supervisor()) +''' + + def _collect_async_samples(self, async_aware_mode): + """Helper to collect samples and count function occurrences. + + Returns a dict mapping function names to their sample counts. 
+ """ + with test_subprocess(self.async_script, wait_for_working=True) as subproc: + try: + collector = CollapsedStackCollector(1000, skip_idle=False) + profiling.sampling.sample.sample( + subproc.process.pid, + collector, + duration_sec=PROFILING_DURATION_SEC, + async_aware=async_aware_mode, + ) + except PermissionError: + self.skipTest("Insufficient permissions for remote profiling") + + # Count samples per function from collapsed stacks + # stack_counter keys are (call_tree, thread_id) where call_tree + # is a tuple of (file, line, func) tuples + func_samples = {} + total = 0 + for (call_tree, _thread_id), count in collector.stack_counter.items(): + total += count + for _file, _line, func in call_tree: + func_samples[func] = func_samples.get(func, 0) + count + + func_samples["_total"] = total + return func_samples + + def test_async_aware_all_sees_sleeping_and_running_tasks(self): + """Test that async_aware='all' captures both sleeping and CPU-running tasks. + + Task tree structure: + main + └── supervisor + ├── Sleeper-0 (sleeping_leaf) + ├── Sleeper-1 (sleeping_leaf) + ├── Sleeper-2 (sleeping_leaf) + └── Worker (cpu_leaf) + + async_aware='all' should see ALL 4 leaf tasks in the output. + """ + samples = self._collect_async_samples("all") + + self.assertGreater(samples["_total"], 0, "Should have collected samples") + self.assertIn("sleeping_leaf", samples) + self.assertIn("cpu_leaf", samples) + self.assertIn("supervisor", samples) + + def test_async_aware_running_sees_only_cpu_task(self): + """Test that async_aware='running' only captures the actively running task. + + Task tree structure: + main + └── supervisor + ├── Sleeper-0 (sleeping_leaf) - NOT visible in 'running' + ├── Sleeper-1 (sleeping_leaf) - NOT visible in 'running' + ├── Sleeper-2 (sleeping_leaf) - NOT visible in 'running' + └── Worker (cpu_leaf) - VISIBLE in 'running' + + async_aware='running' should only see the Worker task doing CPU work. 
+ """ + samples = self._collect_async_samples("running") + + total = samples["_total"] + cpu_leaf_samples = samples.get("cpu_leaf", 0) + + self.assertGreater(total, 0, "Should have collected some samples") + self.assertGreater(cpu_leaf_samples, 0, "cpu_leaf should appear in samples") + + # cpu_leaf should have at least 90% of samples (typically 99%+) + # sleeping_leaf may occasionally appear with very few samples (< 1%) + # when tasks briefly wake up to check sleep timers + cpu_percentage = (cpu_leaf_samples / total) * 100 + self.assertGreater(cpu_percentage, 90.0, + f"cpu_leaf should dominate samples in 'running' mode, " + f"got {cpu_percentage:.1f}% ({cpu_leaf_samples}/{total})") diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py index 1b0e21a5fe4..c0457ee7eb8 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py @@ -143,27 +143,16 @@ def cpu_active_worker(): while True: x += 1 -def main(): - # Start both threads - idle_thread = threading.Thread(target=idle_worker) - cpu_thread = threading.Thread(target=cpu_active_worker) - idle_thread.start() - cpu_thread.start() - - # Wait for CPU thread to be running, then signal test - cpu_ready.wait() - _test_sock.sendall(b"threads_ready") - - idle_thread.join() - cpu_thread.join() - -main() - +idle_thread = threading.Thread(target=idle_worker) +cpu_thread = threading.Thread(target=cpu_active_worker) +idle_thread.start() +cpu_thread.start() +cpu_ready.wait() +_test_sock.sendall(b"working") +idle_thread.join() +cpu_thread.join() """ - with test_subprocess(cpu_vs_idle_script) as subproc: - # Wait for signal that threads are running - response = subproc.socket.recv(1024) - self.assertEqual(response, b"threads_ready") + with test_subprocess(cpu_vs_idle_script, wait_for_working=True) as subproc: with ( io.StringIO() as captured_output, @@ -365,26 +354,16 @@ def gil_holding_work(): while True: x += 1 -def main(): - # Start both threads - idle_thread = threading.Thread(target=gil_releasing_work) - cpu_thread = threading.Thread(target=gil_holding_work) - idle_thread.start() - cpu_thread.start() - - # Wait for GIL-holding thread to be running, then signal test - gil_ready.wait() - _test_sock.sendall(b"threads_ready") - - idle_thread.join() - cpu_thread.join() - -main() +idle_thread = threading.Thread(target=gil_releasing_work) +cpu_thread = threading.Thread(target=gil_holding_work) +idle_thread.start() +cpu_thread.start() +gil_ready.wait() +_test_sock.sendall(b"working") +idle_thread.join() +cpu_thread.join() """ - with test_subprocess(gil_test_script) as subproc: - # Wait for signal that threads are running - response = subproc.socket.recv(1024) - self.assertEqual(response, b"threads_ready") + with test_subprocess(gil_test_script, wait_for_working=True) as subproc: with ( io.StringIO() as captured_output, diff --git a/Lib/test/test_traceback.py b/Lib/test/test_traceback.py index 3876f1a74bb..d107ad92594 100644 --- a/Lib/test/test_traceback.py +++ b/Lib/test/test_traceback.py @@ -1784,6 +1784,23 @@ def test_keyword_suggestions_from_command_string(self): stderr_text = stderr.decode('utf-8') self.assertIn(f"Did you mean '{expected_kw}'", stderr_text) + def test_no_keyword_suggestion_for_comma_errors(self): + # When the parser identifies a missing comma, don't suggest + # bogus keyword replacements like 'print' -> 'not' + code = '''\ +import sys +print( + "line1" + "line2" + file=sys.stderr 
+) +''' + source = textwrap.dedent(code).strip() + rc, stdout, stderr = assert_python_failure('-c', source) + stderr_text = stderr.decode('utf-8') + self.assertIn("Perhaps you forgot a comma", stderr_text) + self.assertNotIn("Did you mean", stderr_text) + @requires_debug_ranges() @force_not_colorized_test_class class PurePythonTracebackErrorCaretTests( diff --git a/Lib/traceback.py b/Lib/traceback.py index 8a3e0f77e76..c1052adeed2 100644 --- a/Lib/traceback.py +++ b/Lib/traceback.py @@ -1340,6 +1340,15 @@ def _find_keyword_typos(self): if len(error_code) > 1024: return + # If the original code doesn't raise SyntaxError, we can't validate + # that a keyword replacement actually fixes anything + try: + codeop.compile_command(error_code, symbol="exec", flags=codeop.PyCF_ONLY_AST) + except SyntaxError: + pass # Good - the original code has a syntax error we might fix + else: + return # Original code compiles or is incomplete - can't validate fixes + error_lines = error_code.splitlines() tokens = tokenize.generate_tokens(io.StringIO(error_code).readline) tokens_left_to_process = 10 diff --git a/Misc/ACKS b/Misc/ACKS index ab6b8662d8f..e3927ff0b33 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -2119,6 +2119,7 @@ Xiang Zhang Robert Xiao Florent Xicluna Yanbo, Xie +Kaisheng Xu Xinhang Xu Arnon Yaari Alakshendra Yadav diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-11-19-16-40-24.gh-issue-141732.PTetqp.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-11-19-16-40-24.gh-issue-141732.PTetqp.rst new file mode 100644 index 00000000000..08420fd5f4d --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-11-19-16-40-24.gh-issue-141732.PTetqp.rst @@ -0,0 +1,2 @@ +Ensure the :meth:`~object.__repr__` for :exc:`ExceptionGroup` and :exc:`BaseExceptionGroup` does +not change when the exception sequence that was original passed in to its constructor is subsequently mutated. diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-11-24-16-07-57.gh-issue-138122.m3EF9E.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-11-24-16-07-57.gh-issue-138122.m3EF9E.rst new file mode 100644 index 00000000000..a4a29e40027 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-11-24-16-07-57.gh-issue-138122.m3EF9E.rst @@ -0,0 +1,6 @@ +Add incomplete sample detection to prevent corrupted profiling data. Each +thread state now contains an embedded base frame (sentinel at the bottom of +the frame stack) with owner type ``FRAME_OWNED_BY_INTERPRETER``. The profiler +validates that stack unwinding terminates at this sentinel frame. Samples that +fail to reach the base frame (due to race conditions, memory corruption, or +other errors) are now rejected rather than being included as spurious data. diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-12-06-00-16-43.gh-issue-142236.m3EF9E.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-12-06-00-16-43.gh-issue-142236.m3EF9E.rst new file mode 100644 index 00000000000..b5c6a27fd6a --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-12-06-00-16-43.gh-issue-142236.m3EF9E.rst @@ -0,0 +1,4 @@ +Fix incorrect keyword suggestions for syntax errors in :mod:`traceback`. The +keyword typo suggestion mechanism would incorrectly suggest replacements when +the extracted source code was incomplete rather than containing an actual typo. +Patch by Pablo Galindo. 
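A minimal standalone sketch of the idea behind the guard added to ``_find_keyword_typos`` above (gh-142236): only attempt keyword-typo suggestions when the extracted snippet itself fails to parse. For illustration it uses ``ast.parse`` instead of the ``codeop.compile_command(..., flags=codeop.PyCF_ONLY_AST)`` call in the patch, and the helper name is hypothetical.

    import ast

    def worth_suggesting_keyword_typos(snippet: str) -> bool:
        # The suggestion pass is only meaningful when the extracted source
        # actually has a syntax error; if it parses cleanly (for example when
        # it was truncated into something valid), any candidate "fix" found by
        # swapping keywords would be spurious.
        try:
            ast.parse(snippet)
        except SyntaxError:
            return True
        return False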
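For the gh-141732 entry above (:exc:`ExceptionGroup` repr), a small illustration of the behaviour the fix guarantees:

    excs = [ValueError("a")]
    eg = ExceptionGroup("msg", excs)
    before = repr(eg)
    excs.append(TypeError("b"))   # mutate the sequence passed to the constructor
    assert repr(eg) == before     # repr reflects the stored exceptions, not the live list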
diff --git a/Misc/NEWS.d/next/Library/2025-11-14-18-00-41.gh-issue-141565.Ap2bhJ.rst b/Misc/NEWS.d/next/Library/2025-11-14-18-00-41.gh-issue-141565.Ap2bhJ.rst new file mode 100644 index 00000000000..628f1e0af03 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-11-14-18-00-41.gh-issue-141565.Ap2bhJ.rst @@ -0,0 +1 @@ +Add async-aware profiling to the Tachyon sampling profiler. The profiler now reconstructs and displays async task hierarchies in flamegraphs, making the output more actionable for users. Patch by Savannah Ostrowski and Pablo Galindo Salgado. diff --git a/Misc/NEWS.d/next/Library/2025-11-16-04-40-06.gh-issue-69113.Xy7Fmn.rst b/Misc/NEWS.d/next/Library/2025-11-16-04-40-06.gh-issue-69113.Xy7Fmn.rst new file mode 100644 index 00000000000..cd76ae9b11e --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-11-16-04-40-06.gh-issue-69113.Xy7Fmn.rst @@ -0,0 +1 @@ +Fix :mod:`doctest` to correctly report line numbers for doctests in ``__test__`` dictionary when formatted as triple-quoted strings by finding unique lines in the string and matching them in the source file. diff --git a/Misc/NEWS.d/next/Library/2025-11-18-15-48-13.gh-issue-105836.sbUw24.rst b/Misc/NEWS.d/next/Library/2025-11-18-15-48-13.gh-issue-105836.sbUw24.rst new file mode 100644 index 00000000000..d2edc5b2cb7 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-11-18-15-48-13.gh-issue-105836.sbUw24.rst @@ -0,0 +1,2 @@ +Fix :meth:`asyncio.run_coroutine_threadsafe` leaving underlying cancelled +asyncio task running. diff --git a/Misc/NEWS.d/next/Library/2025-11-27-10-49-13.gh-issue-142006.nzJDG5.rst b/Misc/NEWS.d/next/Library/2025-11-27-10-49-13.gh-issue-142006.nzJDG5.rst new file mode 100644 index 00000000000..49643892ff9 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-11-27-10-49-13.gh-issue-142006.nzJDG5.rst @@ -0,0 +1 @@ +Fix a bug in the :mod:`email.policy.default` folding algorithm which incorrectly resulted in a doubled newline when a line ending at exactly max_line_length was followed by an unfoldable token. diff --git a/Misc/NEWS.d/next/Library/2025-12-01-14-43-58.gh-issue-138122.nRm3ic.rst b/Misc/NEWS.d/next/Library/2025-12-01-14-43-58.gh-issue-138122.nRm3ic.rst new file mode 100644 index 00000000000..e24fea416ff --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-12-01-14-43-58.gh-issue-138122.nRm3ic.rst @@ -0,0 +1,5 @@ +The ``_remote_debugging`` module now implements frame caching in the +``RemoteUnwinder`` class to reduce memory reads when profiling remote +processes. When ``cache_frames=True``, unchanged portions of the call stack +are reused from previous samples, significantly improving profiling +performance for deep call stacks. diff --git a/Misc/NEWS.d/next/Library/2025-12-04-09-22-31.gh-issue-68552.I_v-xB.rst b/Misc/NEWS.d/next/Library/2025-12-04-09-22-31.gh-issue-68552.I_v-xB.rst new file mode 100644 index 00000000000..bd3e53c9f81 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-12-04-09-22-31.gh-issue-68552.I_v-xB.rst @@ -0,0 +1 @@ +``MisplacedEnvelopeHeaderDefect`` and ``Missing header name`` defects are now correctly passed to the ``handle_defect`` method of ``policy`` in :class:`~email.parser.FeedParser`. 
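For the gh-105836 entry above (``asyncio.run_coroutine_threadsafe``), a hedged sketch of the cross-thread submit-and-cancel pattern the fix concerns; ``long_job``, the timings, and the helper names are illustrative only.

    import asyncio
    import concurrent.futures
    import threading
    import time

    async def long_job():
        await asyncio.sleep(60)

    def submit_and_cancel(loop):
        fut = asyncio.run_coroutine_threadsafe(long_job(), loop)
        time.sleep(0.1)                 # give the task a chance to start on the loop
        fut.cancel()                    # cancel from the submitting thread
        try:
            fut.result(timeout=5)
        except concurrent.futures.CancelledError:
            pass
        loop.call_soon_threadsafe(loop.stop)

    loop = asyncio.new_event_loop()
    threading.Thread(target=submit_and_cancel, args=(loop,), daemon=True).start()
    loop.run_forever()
    # With the fix, cancelling the returned future no longer leaves the
    # underlying asyncio task running on the loop.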
diff --git a/Misc/NEWS.d/next/Library/2025-12-06-13-02-13.gh-issue-142332.PNvXCV.rst b/Misc/NEWS.d/next/Library/2025-12-06-13-02-13.gh-issue-142332.PNvXCV.rst new file mode 100644 index 00000000000..ee2d5e1d491 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-12-06-13-02-13.gh-issue-142332.PNvXCV.rst @@ -0,0 +1,2 @@ +Fix usage formatting for positional arguments in mutually exclusive groups in :mod:`argparse`. +in :mod:`argparse`. diff --git a/Misc/NEWS.d/next/Library/2025-12-06-16-45-34.gh-issue-64532.4OXZpF.rst b/Misc/NEWS.d/next/Library/2025-12-06-16-45-34.gh-issue-64532.4OXZpF.rst new file mode 100644 index 00000000000..3bd950050ae --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-12-06-16-45-34.gh-issue-64532.4OXZpF.rst @@ -0,0 +1 @@ +Subparser help now includes required optional arguments from the parent parser in the usage, making it clearer what arguments are needed to run a subcommand. Patch by Savannah Ostrowski. diff --git a/Misc/NEWS.d/next/Library/2025-12-07-17-30-05.gh-issue-142346.okcAAp.rst b/Misc/NEWS.d/next/Library/2025-12-07-17-30-05.gh-issue-142346.okcAAp.rst new file mode 100644 index 00000000000..cf570f314c0 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-12-07-17-30-05.gh-issue-142346.okcAAp.rst @@ -0,0 +1,3 @@ +Fix usage formatting for mutually exclusive groups in :mod:`argparse` +when they are preceded by positional arguments or followed or intermixed +with other optional arguments. diff --git a/Modules/Setup.stdlib.in b/Modules/Setup.stdlib.in index b1582c75bda..1be83b45526 100644 --- a/Modules/Setup.stdlib.in +++ b/Modules/Setup.stdlib.in @@ -41,7 +41,7 @@ @MODULE__PICKLE_TRUE@_pickle _pickle.c @MODULE__QUEUE_TRUE@_queue _queuemodule.c @MODULE__RANDOM_TRUE@_random _randommodule.c -@MODULE__REMOTE_DEBUGGING_TRUE@_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/threads.c _remote_debugging/asyncio.c +@MODULE__REMOTE_DEBUGGING_TRUE@_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/frame_cache.c _remote_debugging/threads.c _remote_debugging/asyncio.c @MODULE__STRUCT_TRUE@_struct _struct.c # build supports subinterpreters diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h index c4547baf967..7f3c0d363f5 100644 --- a/Modules/_remote_debugging/_remote_debugging.h +++ b/Modules/_remote_debugging/_remote_debugging.h @@ -154,6 +154,39 @@ typedef struct { uintptr_t addr_code_adaptive; } CachedCodeMetadata; +/* Frame cache constants and types */ +#define FRAME_CACHE_MAX_THREADS 32 +#define FRAME_CACHE_MAX_FRAMES 1024 + +typedef struct { + uint64_t thread_id; // 0 = empty slot + uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; + Py_ssize_t num_addrs; + PyObject *frame_list; // owned reference, NULL if empty +} FrameCacheEntry; + +/* Statistics for profiling performance analysis */ +typedef struct { + uint64_t total_samples; // Total number of get_stack_trace calls + uint64_t frame_cache_hits; // Full cache hits (entire stack unchanged) + uint64_t frame_cache_misses; // Cache misses requiring full walk + uint64_t frame_cache_partial_hits; // Partial hits (stopped at cached frame) + uint64_t frames_read_from_cache; // Total frames retrieved from cache + uint64_t frames_read_from_memory; // Total frames read from remote memory + uint64_t memory_reads; // Total remote memory read operations + uint64_t 
memory_bytes_read; // Total bytes read from remote memory + uint64_t code_object_cache_hits; // Code object cache hits + uint64_t code_object_cache_misses; // Code object cache misses + uint64_t stale_cache_invalidations; // Times stale entries were cleared +} UnwinderStats; + +/* Stats tracking macros - no-op when stats collection is disabled */ +#define STATS_INC(unwinder, field) \ + do { if ((unwinder)->collect_stats) (unwinder)->stats.field++; } while(0) + +#define STATS_ADD(unwinder, field, val) \ + do { if ((unwinder)->collect_stats) (unwinder)->stats.field += (val); } while(0) + typedef struct { PyTypeObject *RemoteDebugging_Type; PyTypeObject *TaskInfo_Type; @@ -195,7 +228,12 @@ typedef struct { int skip_non_matching_threads; int native; int gc; + int cache_frames; + int collect_stats; // whether to collect statistics + uint32_t stale_invalidation_counter; // counter for throttling frame_cache_invalidate_stale RemoteDebuggingState *cached_state; + FrameCacheEntry *frame_cache; // preallocated array of FRAME_CACHE_MAX_THREADS entries + UnwinderStats stats; // statistics for performance analysis #ifdef Py_GIL_DISABLED uint32_t tlbc_generation; _Py_hashtable_t *tlbc_cache; @@ -363,9 +401,46 @@ extern int process_frame_chain( uintptr_t initial_frame_addr, StackChunkList *chunks, PyObject *frame_info, - uintptr_t gc_frame + uintptr_t base_frame_addr, + uintptr_t gc_frame, + uintptr_t last_profiled_frame, + int *stopped_at_cached_frame, + uintptr_t *frame_addrs, + Py_ssize_t *num_addrs, + Py_ssize_t max_addrs ); +/* Frame cache functions */ +extern int frame_cache_init(RemoteUnwinderObject *unwinder); +extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder); +extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id); +extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder); +extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result); +extern int frame_cache_lookup_and_extend( + RemoteUnwinderObject *unwinder, + uint64_t thread_id, + uintptr_t last_profiled_frame, + PyObject *frame_info, + uintptr_t *frame_addrs, + Py_ssize_t *num_addrs, + Py_ssize_t max_addrs); +// Returns: 1 = stored, 0 = not stored (graceful), -1 = error +extern int frame_cache_store( + RemoteUnwinderObject *unwinder, + uint64_t thread_id, + PyObject *frame_list, + const uintptr_t *addrs, + Py_ssize_t num_addrs); + +extern int collect_frames_with_cache( + RemoteUnwinderObject *unwinder, + uintptr_t frame_addr, + StackChunkList *chunks, + PyObject *frame_info, + uintptr_t gc_frame, + uintptr_t last_profiled_frame, + uint64_t thread_id); + /* ============================================================================ * THREAD FUNCTION DECLARATIONS * ============================================================================ */ @@ -405,6 +480,7 @@ extern PyObject* unwind_stack_for_thread( extern uintptr_t _Py_RemoteDebug_GetAsyncioDebugAddress(proc_handle_t* handle); extern int read_async_debug(RemoteUnwinderObject *unwinder); +extern int ensure_async_debug_offsets(RemoteUnwinderObject *unwinder); /* Task parsing */ extern PyObject *parse_task_name(RemoteUnwinderObject *unwinder, uintptr_t task_address); diff --git a/Modules/_remote_debugging/asyncio.c b/Modules/_remote_debugging/asyncio.c index 8552311b7dc..7f91f16e3a2 100644 --- a/Modules/_remote_debugging/asyncio.c +++ b/Modules/_remote_debugging/asyncio.c @@ -71,6 +71,28 @@ read_async_debug(RemoteUnwinderObject *unwinder) return result; } +int 
+ensure_async_debug_offsets(RemoteUnwinderObject *unwinder) +{ + // If already available, nothing to do + if (unwinder->async_debug_offsets_available) { + return 0; + } + + // Try to load async debug offsets (the target process may have + // loaded asyncio since we last checked) + if (read_async_debug(unwinder) < 0) { + PyErr_Clear(); + PyErr_SetString(PyExc_RuntimeError, "AsyncioDebug section not available"); + set_exception_cause(unwinder, PyExc_RuntimeError, + "AsyncioDebug section unavailable - asyncio module may not be loaded in target process"); + return -1; + } + + unwinder->async_debug_offsets_available = 1; + return 0; +} + /* ============================================================================ * SET ITERATION FUNCTIONS * ============================================================================ */ diff --git a/Modules/_remote_debugging/clinic/module.c.h b/Modules/_remote_debugging/clinic/module.c.h index 60adb357e32..03127b753cc 100644 --- a/Modules/_remote_debugging/clinic/module.c.h +++ b/Modules/_remote_debugging/clinic/module.c.h @@ -12,7 +12,7 @@ preserve PyDoc_STRVAR(_remote_debugging_RemoteUnwinder___init____doc__, "RemoteUnwinder(pid, *, all_threads=False, only_active_thread=False,\n" " mode=0, debug=False, skip_non_matching_threads=True,\n" -" native=False, gc=False)\n" +" native=False, gc=False, cache_frames=False, stats=False)\n" "--\n" "\n" "Initialize a new RemoteUnwinder object for debugging a remote Python process.\n" @@ -32,6 +32,10 @@ PyDoc_STRVAR(_remote_debugging_RemoteUnwinder___init____doc__, " non-Python code.\n" " gc: If True, include artificial \"\" frames to denote active garbage\n" " collection.\n" +" cache_frames: If True, enable frame caching optimization to avoid re-reading\n" +" unchanged parent frames between samples.\n" +" stats: If True, collect statistics about cache hits, memory reads, etc.\n" +" Use get_stats() to retrieve the collected statistics.\n" "\n" "The RemoteUnwinder provides functionality to inspect and debug a running Python\n" "process, including examining thread states, stack frames and other runtime data.\n" @@ -48,7 +52,8 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, int only_active_thread, int mode, int debug, int skip_non_matching_threads, - int native, int gc); + int native, int gc, + int cache_frames, int stats); static int _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObject *kwargs) @@ -56,7 +61,7 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje int return_value = -1; #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) - #define NUM_KEYWORDS 8 + #define NUM_KEYWORDS 10 static struct { PyGC_Head _this_is_not_used; PyObject_VAR_HEAD @@ -65,7 +70,7 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje } _kwtuple = { .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) .ob_hash = -1, - .ob_item = { &_Py_ID(pid), &_Py_ID(all_threads), &_Py_ID(only_active_thread), &_Py_ID(mode), &_Py_ID(debug), &_Py_ID(skip_non_matching_threads), &_Py_ID(native), &_Py_ID(gc), }, + .ob_item = { &_Py_ID(pid), &_Py_ID(all_threads), &_Py_ID(only_active_thread), &_Py_ID(mode), &_Py_ID(debug), &_Py_ID(skip_non_matching_threads), &_Py_ID(native), &_Py_ID(gc), &_Py_ID(cache_frames), &_Py_ID(stats), }, }; #undef NUM_KEYWORDS #define KWTUPLE (&_kwtuple.ob_base.ob_base) @@ -74,14 +79,14 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje # define KWTUPLE NULL #endif // 
!Py_BUILD_CORE - static const char * const _keywords[] = {"pid", "all_threads", "only_active_thread", "mode", "debug", "skip_non_matching_threads", "native", "gc", NULL}; + static const char * const _keywords[] = {"pid", "all_threads", "only_active_thread", "mode", "debug", "skip_non_matching_threads", "native", "gc", "cache_frames", "stats", NULL}; static _PyArg_Parser _parser = { .keywords = _keywords, .fname = "RemoteUnwinder", .kwtuple = KWTUPLE, }; #undef KWTUPLE - PyObject *argsbuf[8]; + PyObject *argsbuf[10]; PyObject * const *fastargs; Py_ssize_t nargs = PyTuple_GET_SIZE(args); Py_ssize_t noptargs = nargs + (kwargs ? PyDict_GET_SIZE(kwargs) : 0) - 1; @@ -93,6 +98,8 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje int skip_non_matching_threads = 1; int native = 0; int gc = 0; + int cache_frames = 0; + int stats = 0; fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); @@ -160,12 +167,30 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje goto skip_optional_kwonly; } } - gc = PyObject_IsTrue(fastargs[7]); - if (gc < 0) { + if (fastargs[7]) { + gc = PyObject_IsTrue(fastargs[7]); + if (gc < 0) { + goto exit; + } + if (!--noptargs) { + goto skip_optional_kwonly; + } + } + if (fastargs[8]) { + cache_frames = PyObject_IsTrue(fastargs[8]); + if (cache_frames < 0) { + goto exit; + } + if (!--noptargs) { + goto skip_optional_kwonly; + } + } + stats = PyObject_IsTrue(fastargs[9]); + if (stats < 0) { goto exit; } skip_optional_kwonly: - return_value = _remote_debugging_RemoteUnwinder___init___impl((RemoteUnwinderObject *)self, pid, all_threads, only_active_thread, mode, debug, skip_non_matching_threads, native, gc); + return_value = _remote_debugging_RemoteUnwinder___init___impl((RemoteUnwinderObject *)self, pid, all_threads, only_active_thread, mode, debug, skip_non_matching_threads, native, gc, cache_frames, stats); exit: return return_value; @@ -347,4 +372,51 @@ _remote_debugging_RemoteUnwinder_get_async_stack_trace(PyObject *self, PyObject return return_value; } -/*[clinic end generated code: output=99fed5c94cf36881 input=a9049054013a1b77]*/ + +PyDoc_STRVAR(_remote_debugging_RemoteUnwinder_get_stats__doc__, +"get_stats($self, /)\n" +"--\n" +"\n" +"Get collected statistics about profiling performance.\n" +"\n" +"Returns a dictionary containing statistics about cache performance,\n" +"memory reads, and other profiling metrics. 
Only available if the\n" +"RemoteUnwinder was created with stats=True.\n" +"\n" +"Returns:\n" +" dict: A dictionary containing:\n" +" - total_samples: Total number of get_stack_trace calls\n" +" - frame_cache_hits: Full cache hits (entire stack unchanged)\n" +" - frame_cache_misses: Cache misses requiring full walk\n" +" - frame_cache_partial_hits: Partial hits (stopped at cached frame)\n" +" - frames_read_from_cache: Total frames retrieved from cache\n" +" - frames_read_from_memory: Total frames read from remote memory\n" +" - memory_reads: Total remote memory read operations\n" +" - memory_bytes_read: Total bytes read from remote memory\n" +" - code_object_cache_hits: Code object cache hits\n" +" - code_object_cache_misses: Code object cache misses\n" +" - stale_cache_invalidations: Times stale cache entries were cleared\n" +" - frame_cache_hit_rate: Percentage of samples that hit the cache\n" +" - code_object_cache_hit_rate: Percentage of code object lookups that hit cache\n" +"\n" +"Raises:\n" +" RuntimeError: If stats collection was not enabled (stats=False)"); + +#define _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STATS_METHODDEF \ + {"get_stats", (PyCFunction)_remote_debugging_RemoteUnwinder_get_stats, METH_NOARGS, _remote_debugging_RemoteUnwinder_get_stats__doc__}, + +static PyObject * +_remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self); + +static PyObject * +_remote_debugging_RemoteUnwinder_get_stats(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + PyObject *return_value = NULL; + + Py_BEGIN_CRITICAL_SECTION(self); + return_value = _remote_debugging_RemoteUnwinder_get_stats_impl((RemoteUnwinderObject *)self); + Py_END_CRITICAL_SECTION(); + + return return_value; +} +/*[clinic end generated code: output=f1fd6c1d4c4c7254 input=a9049054013a1b77]*/ diff --git a/Modules/_remote_debugging/code_objects.c b/Modules/_remote_debugging/code_objects.c index ea3f00c802b..2cd2505d0f9 100644 --- a/Modules/_remote_debugging/code_objects.c +++ b/Modules/_remote_debugging/code_objects.c @@ -257,6 +257,11 @@ parse_code_object(RemoteUnwinderObject *unwinder, if (unwinder && unwinder->code_object_cache != NULL) { meta = _Py_hashtable_get(unwinder->code_object_cache, key); + if (meta) { + STATS_INC(unwinder, code_object_cache_hits); + } else { + STATS_INC(unwinder, code_object_cache_misses); + } } if (meta == NULL) { diff --git a/Modules/_remote_debugging/frame_cache.c b/Modules/_remote_debugging/frame_cache.c new file mode 100644 index 00000000000..4598b9dc353 --- /dev/null +++ b/Modules/_remote_debugging/frame_cache.c @@ -0,0 +1,236 @@ +/****************************************************************************** + * Remote Debugging Module - Frame Cache + * + * This file contains functions for caching frame information to optimize + * repeated stack unwinding for profiling. + ******************************************************************************/ + +#include "_remote_debugging.h" + +/* ============================================================================ + * FRAME CACHE - stores (address, frame_info) pairs per thread + * Uses preallocated fixed-size arrays for efficiency and bounded memory. 
+ * ============================================================================ */ + +int +frame_cache_init(RemoteUnwinderObject *unwinder) +{ + unwinder->frame_cache = PyMem_Calloc(FRAME_CACHE_MAX_THREADS, sizeof(FrameCacheEntry)); + if (!unwinder->frame_cache) { + PyErr_NoMemory(); + return -1; + } + return 0; +} + +void +frame_cache_cleanup(RemoteUnwinderObject *unwinder) +{ + if (!unwinder->frame_cache) { + return; + } + for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + Py_CLEAR(unwinder->frame_cache[i].frame_list); + } + PyMem_Free(unwinder->frame_cache); + unwinder->frame_cache = NULL; +} + +// Find cache entry by thread_id +FrameCacheEntry * +frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id) +{ + if (!unwinder->frame_cache || thread_id == 0) { + return NULL; + } + for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + if (unwinder->frame_cache[i].thread_id == thread_id) { + return &unwinder->frame_cache[i]; + } + } + return NULL; +} + +// Allocate a cache slot for a thread +// Returns NULL if cache is full (graceful degradation) +static FrameCacheEntry * +frame_cache_alloc_slot(RemoteUnwinderObject *unwinder, uint64_t thread_id) +{ + if (!unwinder->frame_cache || thread_id == 0) { + return NULL; + } + // First check if thread already has an entry + for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + if (unwinder->frame_cache[i].thread_id == thread_id) { + return &unwinder->frame_cache[i]; + } + } + // Find empty slot + for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + if (unwinder->frame_cache[i].thread_id == 0) { + return &unwinder->frame_cache[i]; + } + } + // Cache full - graceful degradation + return NULL; +} + +// Remove cache entries for threads not seen in the result +// result structure: list of InterpreterInfo, where InterpreterInfo[1] is threads list, +// and ThreadInfo[0] is the thread_id +void +frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result) +{ + if (!unwinder->frame_cache || !result || !PyList_Check(result)) { + return; + } + + // Build array of seen thread IDs from result + uint64_t seen_threads[FRAME_CACHE_MAX_THREADS]; + int num_seen = 0; + + Py_ssize_t num_interps = PyList_GET_SIZE(result); + for (Py_ssize_t i = 0; i < num_interps && num_seen < FRAME_CACHE_MAX_THREADS; i++) { + PyObject *interp_info = PyList_GET_ITEM(result, i); + PyObject *threads = PyStructSequence_GetItem(interp_info, 1); + if (!threads || !PyList_Check(threads)) { + continue; + } + Py_ssize_t num_threads = PyList_GET_SIZE(threads); + for (Py_ssize_t j = 0; j < num_threads && num_seen < FRAME_CACHE_MAX_THREADS; j++) { + PyObject *thread_info = PyList_GET_ITEM(threads, j); + PyObject *tid_obj = PyStructSequence_GetItem(thread_info, 0); + if (tid_obj) { + uint64_t tid = PyLong_AsUnsignedLongLong(tid_obj); + if (!PyErr_Occurred()) { + seen_threads[num_seen++] = tid; + } else { + PyErr_Clear(); + } + } + } + } + + // Invalidate entries not in seen list + for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + if (unwinder->frame_cache[i].thread_id == 0) { + continue; + } + int found = 0; + for (int j = 0; j < num_seen; j++) { + if (unwinder->frame_cache[i].thread_id == seen_threads[j]) { + found = 1; + break; + } + } + if (!found) { + // Clear this entry + Py_CLEAR(unwinder->frame_cache[i].frame_list); + unwinder->frame_cache[i].thread_id = 0; + unwinder->frame_cache[i].num_addrs = 0; + STATS_INC(unwinder, stale_cache_invalidations); + } + } +} + +// Find last_profiled_frame in cache and extend frame_info with cached continuation +// If 
frame_addrs is provided (not NULL), also extends it with cached addresses +int +frame_cache_lookup_and_extend( + RemoteUnwinderObject *unwinder, + uint64_t thread_id, + uintptr_t last_profiled_frame, + PyObject *frame_info, + uintptr_t *frame_addrs, + Py_ssize_t *num_addrs, + Py_ssize_t max_addrs) +{ + if (!unwinder->frame_cache || last_profiled_frame == 0) { + return 0; + } + + FrameCacheEntry *entry = frame_cache_find(unwinder, thread_id); + if (!entry || !entry->frame_list) { + return 0; + } + + // Find the index where last_profiled_frame matches + Py_ssize_t start_idx = -1; + for (Py_ssize_t i = 0; i < entry->num_addrs; i++) { + if (entry->addrs[i] == last_profiled_frame) { + start_idx = i; + break; + } + } + + if (start_idx < 0) { + return 0; // Not found + } + + Py_ssize_t num_frames = PyList_GET_SIZE(entry->frame_list); + + // Extend frame_info with frames from start_idx onwards + PyObject *slice = PyList_GetSlice(entry->frame_list, start_idx, num_frames); + if (!slice) { + return -1; + } + + Py_ssize_t cur_size = PyList_GET_SIZE(frame_info); + int result = PyList_SetSlice(frame_info, cur_size, cur_size, slice); + Py_DECREF(slice); + + if (result < 0) { + return -1; + } + + // Also extend frame_addrs with cached addresses if provided + if (frame_addrs) { + for (Py_ssize_t i = start_idx; i < entry->num_addrs && *num_addrs < max_addrs; i++) { + frame_addrs[(*num_addrs)++] = entry->addrs[i]; + } + } + + return 1; +} + +// Store frame list with addresses in cache +// Returns: 1 = stored successfully, 0 = not stored (graceful degradation), -1 = error +int +frame_cache_store( + RemoteUnwinderObject *unwinder, + uint64_t thread_id, + PyObject *frame_list, + const uintptr_t *addrs, + Py_ssize_t num_addrs) +{ + if (!unwinder->frame_cache || thread_id == 0) { + return 0; + } + + // Clamp to max frames + if (num_addrs > FRAME_CACHE_MAX_FRAMES) { + num_addrs = FRAME_CACHE_MAX_FRAMES; + } + + FrameCacheEntry *entry = frame_cache_alloc_slot(unwinder, thread_id); + if (!entry) { + // Cache full - graceful degradation + return 0; + } + + // Clear old frame_list if replacing + Py_CLEAR(entry->frame_list); + + // Store full frame list (don't truncate to num_addrs - frames beyond the + // address array limit are still valid and needed for full cache hits) + Py_ssize_t num_frames = PyList_GET_SIZE(frame_list); + entry->frame_list = PyList_GetSlice(frame_list, 0, num_frames); + if (!entry->frame_list) { + return -1; + } + entry->thread_id = thread_id; + memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t)); + entry->num_addrs = num_addrs; + + return 1; +} diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c index d60caadcb9a..eaf3287c6fe 100644 --- a/Modules/_remote_debugging/frames.c +++ b/Modules/_remote_debugging/frames.c @@ -154,14 +154,13 @@ is_frame_valid( void* frame = (void*)frame_addr; - if (GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner) == FRAME_OWNED_BY_INTERPRETER) { - return 0; // C frame + char owner = GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner); + if (owner == FRAME_OWNED_BY_INTERPRETER) { + return 0; // C frame or sentinel base frame } - if (GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner) != FRAME_OWNED_BY_GENERATOR - && GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner) != FRAME_OWNED_BY_THREAD) { - PyErr_Format(PyExc_RuntimeError, "Unhandled frame owner %d.\n", - GET_MEMBER(char, frame, unwinder->debug_offsets.interpreter_frame.owner)); + if (owner 
!= FRAME_OWNED_BY_GENERATOR && owner != FRAME_OWNED_BY_THREAD) { + PyErr_Format(PyExc_RuntimeError, "Unhandled frame owner %d.\n", owner); set_exception_cause(unwinder, PyExc_RuntimeError, "Unhandled frame owner type in async frame"); return -1; } @@ -189,6 +188,8 @@ parse_frame_object( set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame"); return -1; } + STATS_INC(unwinder, memory_reads); + STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME); *previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous); uintptr_t code_object = GET_MEMBER_NO_TAG(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable); @@ -258,17 +259,45 @@ process_frame_chain( uintptr_t initial_frame_addr, StackChunkList *chunks, PyObject *frame_info, - uintptr_t gc_frame) + uintptr_t base_frame_addr, + uintptr_t gc_frame, + uintptr_t last_profiled_frame, + int *stopped_at_cached_frame, + uintptr_t *frame_addrs, // optional: C array to receive frame addresses + Py_ssize_t *num_addrs, // in/out: current count / updated count + Py_ssize_t max_addrs) // max capacity of frame_addrs array { uintptr_t frame_addr = initial_frame_addr; uintptr_t prev_frame_addr = 0; - const size_t MAX_FRAMES = 1024; + uintptr_t last_frame_addr = 0; // Track last frame visited for validation + const size_t MAX_FRAMES = 1024 + 512; size_t frame_count = 0; + // Initialize output flag + if (stopped_at_cached_frame) { + *stopped_at_cached_frame = 0; + } + + // Quick check: if current_frame == last_profiled_frame, entire stack is unchanged + if (last_profiled_frame != 0 && initial_frame_addr == last_profiled_frame) { + if (stopped_at_cached_frame) { + *stopped_at_cached_frame = 1; + } + return 0; + } + while ((void*)frame_addr != NULL) { + // Check if we've reached the cached frame - if so, stop here + if (last_profiled_frame != 0 && frame_addr == last_profiled_frame) { + if (stopped_at_cached_frame) { + *stopped_at_cached_frame = 1; + } + break; + } PyObject *frame = NULL; uintptr_t next_frame_addr = 0; uintptr_t stackpointer = 0; + last_frame_addr = frame_addr; // Remember this frame address if (++frame_count > MAX_FRAMES) { PyErr_SetString(PyExc_RuntimeError, "Too many stack frames (possible infinite loop)"); @@ -276,7 +305,6 @@ process_frame_chain( return -1; } - // Try chunks first, fallback to direct memory read if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, chunks) < 0) { PyErr_Clear(); uintptr_t address_of_code_object = 0; @@ -286,7 +314,6 @@ process_frame_chain( } } if (frame == NULL && PyList_GET_SIZE(frame_info) == 0) { - // If the first frame is missing, the chain is broken: const char *e = "Failed to parse initial frame in chain"; PyErr_SetString(PyExc_RuntimeError, e); return -1; @@ -310,36 +337,40 @@ process_frame_chain( extra_frame = &_Py_STR(native); } if (extra_frame) { - // Use "~" as file and 0 as line, since that's what pstats uses: PyObject *extra_frame_info = make_frame_info( unwinder, _Py_LATIN1_CHR('~'), _PyLong_GetZero(), extra_frame); if (extra_frame_info == NULL) { return -1; } - int error = PyList_Append(frame_info, extra_frame_info); - Py_DECREF(extra_frame_info); - if (error) { - const char *e = "Failed to append extra frame to frame info list"; - set_exception_cause(unwinder, PyExc_RuntimeError, e); + if (PyList_Append(frame_info, extra_frame_info) < 0) { + Py_DECREF(extra_frame_info); + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to append extra frame"); return -1; } + // 
Extra frames use 0 as address (they're synthetic) + if (frame_addrs && *num_addrs < max_addrs) { + frame_addrs[(*num_addrs)++] = 0; + } + Py_DECREF(extra_frame_info); } if (frame) { if (prev_frame_addr && frame_addr != prev_frame_addr) { const char *f = "Broken frame chain: expected frame at 0x%lx, got 0x%lx"; PyErr_Format(PyExc_RuntimeError, f, prev_frame_addr, frame_addr); Py_DECREF(frame); - const char *e = "Frame chain consistency check failed"; - set_exception_cause(unwinder, PyExc_RuntimeError, e); + set_exception_cause(unwinder, PyExc_RuntimeError, "Frame chain consistency check failed"); return -1; } - if (PyList_Append(frame_info, frame) == -1) { + if (PyList_Append(frame_info, frame) < 0) { Py_DECREF(frame); - const char *e = "Failed to append frame to frame info list"; - set_exception_cause(unwinder, PyExc_RuntimeError, e); + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to append frame"); return -1; } + // Track the address for this frame + if (frame_addrs && *num_addrs < max_addrs) { + frame_addrs[(*num_addrs)++] = frame_addr; + } Py_DECREF(frame); } @@ -347,5 +378,221 @@ process_frame_chain( frame_addr = next_frame_addr; } + // Validate we reached the base frame (sentinel at bottom of stack) + // Only validate if we walked the full chain (didn't stop at cached frame) + // and base_frame_addr is provided (non-zero) + int stopped_early = stopped_at_cached_frame && *stopped_at_cached_frame; + if (!stopped_early && base_frame_addr != 0 && last_frame_addr != base_frame_addr) { + PyErr_Format(PyExc_RuntimeError, + "Incomplete sample: did not reach base frame (expected 0x%lx, got 0x%lx)", + base_frame_addr, last_frame_addr); + return -1; + } + + return 0; +} + +// Clear last_profiled_frame for all threads in the target process. +// This must be called at the start of profiling to avoid stale values +// from previous profilers causing us to stop frame walking early. +int +clear_last_profiled_frames(RemoteUnwinderObject *unwinder) +{ + uintptr_t current_interp = unwinder->interpreter_addr; + uintptr_t zero = 0; + + while (current_interp != 0) { + // Get first thread in this interpreter + uintptr_t tstate_addr; + if (_Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + current_interp + unwinder->debug_offsets.interpreter_state.threads_head, + sizeof(void*), + &tstate_addr) < 0) { + // Non-fatal: just skip clearing + PyErr_Clear(); + return 0; + } + + // Iterate all threads in this interpreter + while (tstate_addr != 0) { + // Clear last_profiled_frame + uintptr_t lpf_addr = tstate_addr + unwinder->debug_offsets.thread_state.last_profiled_frame; + if (_Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, lpf_addr, + sizeof(uintptr_t), &zero) < 0) { + // Non-fatal: just continue + PyErr_Clear(); + } + + // Move to next thread + if (_Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + tstate_addr + unwinder->debug_offsets.thread_state.next, + sizeof(void*), + &tstate_addr) < 0) { + PyErr_Clear(); + break; + } + } + + // Move to next interpreter + if (_Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + current_interp + unwinder->debug_offsets.interpreter_state.next, + sizeof(void*), + ¤t_interp) < 0) { + PyErr_Clear(); + break; + } + } + + return 0; +} + +// Fast path: check if we have a full cache hit (parent stack unchanged) +// A "full hit" means current frame == last profiled frame, so we can reuse +// cached parent frames. 
We always read the current frame from memory to get +// updated line numbers (the line within a frame can change between samples). +// Returns: 1 if full hit (frame_info populated with current frame + cached parents), +// 0 if miss, -1 on error +static int +try_full_cache_hit( + RemoteUnwinderObject *unwinder, + uintptr_t frame_addr, + uintptr_t last_profiled_frame, + uint64_t thread_id, + PyObject *frame_info) +{ + if (!unwinder->frame_cache || last_profiled_frame == 0) { + return 0; + } + // Full hit only if current frame == last profiled frame + if (frame_addr != last_profiled_frame) { + return 0; + } + + FrameCacheEntry *entry = frame_cache_find(unwinder, thread_id); + if (!entry || !entry->frame_list) { + return 0; + } + + // Verify first address matches (sanity check) + if (entry->num_addrs == 0 || entry->addrs[0] != frame_addr) { + return 0; + } + + // Always read the current frame from memory to get updated line number + PyObject *current_frame = NULL; + uintptr_t code_object_addr = 0; + uintptr_t previous_frame = 0; + int parse_result = parse_frame_object(unwinder, &current_frame, frame_addr, + &code_object_addr, &previous_frame); + if (parse_result < 0) { + return -1; + } + + // Get cached parent frames first (before modifying frame_info) + Py_ssize_t cached_size = PyList_GET_SIZE(entry->frame_list); + PyObject *parent_slice = NULL; + if (cached_size > 1) { + parent_slice = PyList_GetSlice(entry->frame_list, 1, cached_size); + if (!parent_slice) { + Py_XDECREF(current_frame); + return -1; + } + } + + // Now safe to modify frame_info - add current frame if valid + if (current_frame != NULL) { + if (PyList_Append(frame_info, current_frame) < 0) { + Py_DECREF(current_frame); + Py_XDECREF(parent_slice); + return -1; + } + Py_DECREF(current_frame); + STATS_ADD(unwinder, frames_read_from_memory, 1); + } + + // Extend with cached parent frames + if (parent_slice) { + Py_ssize_t cur_size = PyList_GET_SIZE(frame_info); + int result = PyList_SetSlice(frame_info, cur_size, cur_size, parent_slice); + Py_DECREF(parent_slice); + if (result < 0) { + return -1; + } + STATS_ADD(unwinder, frames_read_from_cache, cached_size - 1); + } + + STATS_INC(unwinder, frame_cache_hits); + return 1; +} + +// High-level helper: collect frames with cache optimization +// Returns complete frame_info list, handling all cache logic internally +int +collect_frames_with_cache( + RemoteUnwinderObject *unwinder, + uintptr_t frame_addr, + StackChunkList *chunks, + PyObject *frame_info, + uintptr_t gc_frame, + uintptr_t last_profiled_frame, + uint64_t thread_id) +{ + // Fast path: check for full cache hit first (no allocations needed) + int full_hit = try_full_cache_hit(unwinder, frame_addr, last_profiled_frame, + thread_id, frame_info); + if (full_hit != 0) { + return full_hit < 0 ? 
-1 : 0; // Either error or success + } + + uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; + Py_ssize_t num_addrs = 0; + Py_ssize_t frames_before = PyList_GET_SIZE(frame_info); + + int stopped_at_cached = 0; + if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, 0, gc_frame, + last_profiled_frame, &stopped_at_cached, + addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) { + return -1; + } + + // Track frames read from memory (frames added by process_frame_chain) + STATS_ADD(unwinder, frames_read_from_memory, PyList_GET_SIZE(frame_info) - frames_before); + + // If stopped at cached frame, extend with cached continuation (both frames and addresses) + if (stopped_at_cached) { + Py_ssize_t frames_before_cache = PyList_GET_SIZE(frame_info); + int cache_result = frame_cache_lookup_and_extend(unwinder, thread_id, last_profiled_frame, + frame_info, addrs, &num_addrs, + FRAME_CACHE_MAX_FRAMES); + if (cache_result < 0) { + return -1; + } + if (cache_result == 0) { + // Cache miss - continue walking from last_profiled_frame to get the rest + STATS_INC(unwinder, frame_cache_misses); + Py_ssize_t frames_before_walk = PyList_GET_SIZE(frame_info); + if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, 0, gc_frame, + 0, NULL, addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) { + return -1; + } + STATS_ADD(unwinder, frames_read_from_memory, PyList_GET_SIZE(frame_info) - frames_before_walk); + } else { + // Partial cache hit + STATS_INC(unwinder, frame_cache_partial_hits); + STATS_ADD(unwinder, frames_read_from_cache, PyList_GET_SIZE(frame_info) - frames_before_cache); + } + } else if (last_profiled_frame == 0) { + // No cache involvement (no last_profiled_frame or cache disabled) + STATS_INC(unwinder, frame_cache_misses); + } + + // Store in cache (frame_cache_store handles truncation if num_addrs > FRAME_CACHE_MAX_FRAMES) + if (frame_cache_store(unwinder, thread_id, frame_info, addrs, num_addrs) < 0) { + return -1; + } + return 0; } diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c index 252291f9162..123e4f5c4d7 100644 --- a/Modules/_remote_debugging/module.c +++ b/Modules/_remote_debugging/module.c @@ -235,6 +235,8 @@ _remote_debugging.RemoteUnwinder.__init__ skip_non_matching_threads: bool = True native: bool = False gc: bool = False + cache_frames: bool = False + stats: bool = False Initialize a new RemoteUnwinder object for debugging a remote Python process. @@ -253,6 +255,10 @@ Initialize a new RemoteUnwinder object for debugging a remote Python process. non-Python code. gc: If True, include artificial "" frames to denote active garbage collection. + cache_frames: If True, enable frame caching optimization to avoid re-reading + unchanged parent frames between samples. + stats: If True, collect statistics about cache hits, memory reads, etc. + Use get_stats() to retrieve the collected statistics. The RemoteUnwinder provides functionality to inspect and debug a running Python process, including examining thread states, stack frames and other runtime data. 
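
A rough usage sketch of the two new constructor options documented above, ``cache_frames`` and ``stats``, as seen from Python; the target PID, the sampling loop, and the printed keys are illustrative only and not part of this patch:

    import time
    import _remote_debugging

    # Attach to a running CPython process (1234 is a placeholder PID) with
    # frame caching and statistics collection enabled.
    unwinder = _remote_debugging.RemoteUnwinder(1234, cache_frames=True, stats=True)

    # Repeated samples only re-read frames that changed since the last call;
    # unchanged parent frames are served from the per-thread frame cache.
    for _ in range(100):
        unwinder.get_stack_trace()
        time.sleep(0.001)

    stats = unwinder.get_stats()
    print(stats["frame_cache_hit_rate"], stats["frames_read_from_memory"])
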
@@ -270,8 +276,9 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, int only_active_thread, int mode, int debug, int skip_non_matching_threads, - int native, int gc) -/*[clinic end generated code: output=e9eb6b4df119f6e0 input=606d099059207df2]*/ + int native, int gc, + int cache_frames, int stats) +/*[clinic end generated code: output=b34ef8cce013c975 input=df2221ef114c3d6a]*/ { // Validate that all_threads and only_active_thread are not both True if (all_threads && only_active_thread) { @@ -283,18 +290,24 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, #ifdef Py_GIL_DISABLED if (only_active_thread) { PyErr_SetString(PyExc_ValueError, - "only_active_thread is not supported when Py_GIL_DISABLED is not defined"); + "only_active_thread is not supported in free-threaded builds"); return -1; } #endif self->native = native; self->gc = gc; + self->cache_frames = cache_frames; + self->collect_stats = stats; + self->stale_invalidation_counter = 0; self->debug = debug; self->only_active_thread = only_active_thread; self->mode = mode; self->skip_non_matching_threads = skip_non_matching_threads; self->cached_state = NULL; + self->frame_cache = NULL; + // Initialize stats to zero + memset(&self->stats, 0, sizeof(self->stats)); if (_Py_RemoteDebug_InitProcHandle(&self->handle, pid) < 0) { set_exception_cause(self, PyExc_RuntimeError, "Failed to initialize process handle"); return -1; @@ -375,6 +388,16 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, self->win_process_buffer_size = 0; #endif + if (cache_frames && frame_cache_init(self) < 0) { + return -1; + } + + // Clear stale last_profiled_frame values from previous profilers + // This prevents us from stopping frame walking early due to stale values + if (cache_frames) { + clear_last_profiled_frames(self); + } + return 0; } @@ -429,6 +452,8 @@ static PyObject * _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self) /*[clinic end generated code: output=666192b90c69d567 input=bcff01c73cccc1c0]*/ { + STATS_INC(self, total_samples); + PyObject* result = PyList_New(0); if (!result) { set_exception_cause(self, PyExc_MemoryError, "Failed to create stack trace result list"); @@ -591,7 +616,15 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self } exit: - _Py_RemoteDebug_ClearCache(&self->handle); + // Invalidate cache entries for threads not seen in this sample. + // Only do this every 1024 iterations to avoid performance overhead. 
+ if (self->cache_frames && result) { + if (++self->stale_invalidation_counter >= 1024) { + self->stale_invalidation_counter = 0; + frame_cache_invalidate_stale(self, result); + } + } + _Py_RemoteDebug_ClearCache(&self->handle); return result; } @@ -645,9 +678,7 @@ static PyObject * _remote_debugging_RemoteUnwinder_get_all_awaited_by_impl(RemoteUnwinderObject *self) /*[clinic end generated code: output=6a49cd345e8aec53 input=307f754cbe38250c]*/ { - if (!self->async_debug_offsets_available) { - PyErr_SetString(PyExc_RuntimeError, "AsyncioDebug section not available"); - set_exception_cause(self, PyExc_RuntimeError, "AsyncioDebug section unavailable in get_all_awaited_by"); + if (ensure_async_debug_offsets(self) < 0) { return NULL; } @@ -736,9 +767,7 @@ static PyObject * _remote_debugging_RemoteUnwinder_get_async_stack_trace_impl(RemoteUnwinderObject *self) /*[clinic end generated code: output=6433d52b55e87bbe input=6129b7d509a887c9]*/ { - if (!self->async_debug_offsets_available) { - PyErr_SetString(PyExc_RuntimeError, "AsyncioDebug section not available"); - set_exception_cause(self, PyExc_RuntimeError, "AsyncioDebug section unavailable in get_async_stack_trace"); + if (ensure_async_debug_offsets(self) < 0) { return NULL; } @@ -761,10 +790,114 @@ _remote_debugging_RemoteUnwinder_get_async_stack_trace_impl(RemoteUnwinderObject return NULL; } +/*[clinic input] +@permit_long_docstring_body +@critical_section +_remote_debugging.RemoteUnwinder.get_stats + +Get collected statistics about profiling performance. + +Returns a dictionary containing statistics about cache performance, +memory reads, and other profiling metrics. Only available if the +RemoteUnwinder was created with stats=True. + +Returns: + dict: A dictionary containing: + - total_samples: Total number of get_stack_trace calls + - frame_cache_hits: Full cache hits (entire stack unchanged) + - frame_cache_misses: Cache misses requiring full walk + - frame_cache_partial_hits: Partial hits (stopped at cached frame) + - frames_read_from_cache: Total frames retrieved from cache + - frames_read_from_memory: Total frames read from remote memory + - memory_reads: Total remote memory read operations + - memory_bytes_read: Total bytes read from remote memory + - code_object_cache_hits: Code object cache hits + - code_object_cache_misses: Code object cache misses + - stale_cache_invalidations: Times stale cache entries were cleared + - frame_cache_hit_rate: Percentage of samples that hit the cache + - code_object_cache_hit_rate: Percentage of code object lookups that hit cache + +Raises: + RuntimeError: If stats collection was not enabled (stats=False) +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self) +/*[clinic end generated code: output=21e36477122be2a0 input=75fef4134c12a8c9]*/ +{ + if (!self->collect_stats) { + PyErr_SetString(PyExc_RuntimeError, + "Statistics collection was not enabled. 
" + "Create RemoteUnwinder with stats=True to collect statistics."); + return NULL; + } + + PyObject *result = PyDict_New(); + if (!result) { + return NULL; + } + +#define ADD_STAT(name) do { \ + PyObject *val = PyLong_FromUnsignedLongLong(self->stats.name); \ + if (!val || PyDict_SetItemString(result, #name, val) < 0) { \ + Py_XDECREF(val); \ + Py_DECREF(result); \ + return NULL; \ + } \ + Py_DECREF(val); \ +} while(0) + + ADD_STAT(total_samples); + ADD_STAT(frame_cache_hits); + ADD_STAT(frame_cache_misses); + ADD_STAT(frame_cache_partial_hits); + ADD_STAT(frames_read_from_cache); + ADD_STAT(frames_read_from_memory); + ADD_STAT(memory_reads); + ADD_STAT(memory_bytes_read); + ADD_STAT(code_object_cache_hits); + ADD_STAT(code_object_cache_misses); + ADD_STAT(stale_cache_invalidations); + +#undef ADD_STAT + + // Calculate and add derived statistics + // Hit rate is calculated as (hits + partial_hits) / total_cache_lookups + double frame_cache_hit_rate = 0.0; + uint64_t total_cache_lookups = self->stats.frame_cache_hits + self->stats.frame_cache_partial_hits + self->stats.frame_cache_misses; + if (total_cache_lookups > 0) { + frame_cache_hit_rate = 100.0 * (double)(self->stats.frame_cache_hits + self->stats.frame_cache_partial_hits) + / (double)total_cache_lookups; + } + PyObject *hit_rate = PyFloat_FromDouble(frame_cache_hit_rate); + if (!hit_rate || PyDict_SetItemString(result, "frame_cache_hit_rate", hit_rate) < 0) { + Py_XDECREF(hit_rate); + Py_DECREF(result); + return NULL; + } + Py_DECREF(hit_rate); + + double code_object_hit_rate = 0.0; + uint64_t total_code_lookups = self->stats.code_object_cache_hits + self->stats.code_object_cache_misses; + if (total_code_lookups > 0) { + code_object_hit_rate = 100.0 * (double)self->stats.code_object_cache_hits / (double)total_code_lookups; + } + PyObject *code_hit_rate = PyFloat_FromDouble(code_object_hit_rate); + if (!code_hit_rate || PyDict_SetItemString(result, "code_object_cache_hit_rate", code_hit_rate) < 0) { + Py_XDECREF(code_hit_rate); + Py_DECREF(result); + return NULL; + } + Py_DECREF(code_hit_rate); + + return result; +} + static PyMethodDef RemoteUnwinder_methods[] = { _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STACK_TRACE_METHODDEF _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_ALL_AWAITED_BY_METHODDEF _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_ASYNC_STACK_TRACE_METHODDEF + _REMOTE_DEBUGGING_REMOTEUNWINDER_GET_STATS_METHODDEF {NULL, NULL} }; @@ -791,6 +924,7 @@ RemoteUnwinder_dealloc(PyObject *op) _Py_RemoteDebug_ClearCache(&self->handle); _Py_RemoteDebug_CleanupProcHandle(&self->handle); } + frame_cache_cleanup(self); PyObject_Del(self); Py_DECREF(tp); } diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c index 99147b01a1b..69819eb8dcd 100644 --- a/Modules/_remote_debugging/threads.c +++ b/Modules/_remote_debugging/threads.c @@ -296,6 +296,8 @@ unwind_stack_for_thread( set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state"); goto error; } + STATS_INC(unwinder, memory_reads); + STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.thread_state.size); long tid = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id); @@ -309,6 +311,8 @@ unwind_stack_for_thread( set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read GC state"); goto error; } + STATS_INC(unwinder, memory_reads); + STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.gc.size); // Calculate thread status using flags (always) int status_flags = 0; @@ -376,6 +380,7 @@ 
unwind_stack_for_thread( } uintptr_t frame_addr = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.current_frame); + uintptr_t base_frame_addr = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.base_frame); frame_info = PyList_New(0); if (!frame_info) { @@ -383,14 +388,36 @@ unwind_stack_for_thread( goto error; } - if (copy_stack_chunks(unwinder, *current_tstate, &chunks) < 0) { - set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to copy stack chunks"); - goto error; + // In cache mode, copying stack chunks is more expensive than direct memory reads + if (!unwinder->cache_frames) { + if (copy_stack_chunks(unwinder, *current_tstate, &chunks) < 0) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to copy stack chunks"); + goto error; + } } - if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info, gc_frame) < 0) { - set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process frame chain"); - goto error; + if (unwinder->cache_frames) { + // Use cache to avoid re-reading unchanged parent frames + uintptr_t last_profiled_frame = GET_MEMBER(uintptr_t, ts, + unwinder->debug_offsets.thread_state.last_profiled_frame); + if (collect_frames_with_cache(unwinder, frame_addr, &chunks, frame_info, + gc_frame, last_profiled_frame, tid) < 0) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to collect frames"); + goto error; + } + // Update last_profiled_frame for next sample + uintptr_t lpf_addr = *current_tstate + unwinder->debug_offsets.thread_state.last_profiled_frame; + if (_Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, lpf_addr, + sizeof(uintptr_t), &frame_addr) < 0) { + PyErr_Clear(); // Non-fatal + } + } else { + // No caching - process entire frame chain with base_frame validation + if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info, + base_frame_addr, gc_frame, 0, NULL, NULL, NULL, 0) < 0) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process frame chain"); + goto error; + } } *current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next); diff --git a/Objects/exceptions.c b/Objects/exceptions.c index cebbed1fa68..7b4b63e9a4a 100644 --- a/Objects/exceptions.c +++ b/Objects/exceptions.c @@ -694,12 +694,12 @@ PyTypeObject _PyExc_ ## EXCNAME = { \ #define ComplexExtendsException(EXCBASE, EXCNAME, EXCSTORE, EXCNEW, \ EXCMETHODS, EXCMEMBERS, EXCGETSET, \ - EXCSTR, EXCDOC) \ + EXCSTR, EXCREPR, EXCDOC) \ static PyTypeObject _PyExc_ ## EXCNAME = { \ PyVarObject_HEAD_INIT(NULL, 0) \ # EXCNAME, \ sizeof(Py ## EXCSTORE ## Object), 0, \ - EXCSTORE ## _dealloc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ + EXCSTORE ## _dealloc, 0, 0, 0, 0, EXCREPR, 0, 0, 0, 0, 0, \ EXCSTR, 0, 0, 0, \ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, \ PyDoc_STR(EXCDOC), EXCSTORE ## _traverse, \ @@ -792,7 +792,7 @@ StopIteration_traverse(PyObject *op, visitproc visit, void *arg) } ComplexExtendsException(PyExc_Exception, StopIteration, StopIteration, - 0, 0, StopIteration_members, 0, 0, + 0, 0, StopIteration_members, 0, 0, 0, "Signal the end from iterator.__next__()."); @@ -865,7 +865,7 @@ static PyMemberDef SystemExit_members[] = { }; ComplexExtendsException(PyExc_BaseException, SystemExit, SystemExit, - 0, 0, SystemExit_members, 0, 0, + 0, 0, SystemExit_members, 0, 0, 0, "Request to exit from the interpreter."); /* @@ -890,6 +890,7 @@ BaseExceptionGroup_new(PyTypeObject *type, PyObject *args, PyObject *kwds) PyObject *message = NULL; PyObject *exceptions = NULL; + PyObject 
*exceptions_str = NULL; if (!PyArg_ParseTuple(args, "UO:BaseExceptionGroup.__new__", @@ -905,6 +906,18 @@ BaseExceptionGroup_new(PyTypeObject *type, PyObject *args, PyObject *kwds) return NULL; } + /* Save initial exceptions sequence as a string in case sequence is mutated */ + if (!PyList_Check(exceptions) && !PyTuple_Check(exceptions)) { + exceptions_str = PyObject_Repr(exceptions); + if (exceptions_str == NULL) { + /* We don't hold a reference to exceptions, so clear it before + * attempting a decref in the cleanup. + */ + exceptions = NULL; + goto error; + } + } + exceptions = PySequence_Tuple(exceptions); if (!exceptions) { return NULL; @@ -988,9 +1001,11 @@ BaseExceptionGroup_new(PyTypeObject *type, PyObject *args, PyObject *kwds) self->msg = Py_NewRef(message); self->excs = exceptions; + self->excs_str = exceptions_str; return (PyObject*)self; error: - Py_DECREF(exceptions); + Py_XDECREF(exceptions); + Py_XDECREF(exceptions_str); return NULL; } @@ -1029,6 +1044,7 @@ BaseExceptionGroup_clear(PyObject *op) PyBaseExceptionGroupObject *self = PyBaseExceptionGroupObject_CAST(op); Py_CLEAR(self->msg); Py_CLEAR(self->excs); + Py_CLEAR(self->excs_str); return BaseException_clear(op); } @@ -1046,6 +1062,7 @@ BaseExceptionGroup_traverse(PyObject *op, visitproc visit, void *arg) PyBaseExceptionGroupObject *self = PyBaseExceptionGroupObject_CAST(op); Py_VISIT(self->msg); Py_VISIT(self->excs); + Py_VISIT(self->excs_str); return BaseException_traverse(op, visit, arg); } @@ -1063,6 +1080,54 @@ BaseExceptionGroup_str(PyObject *op) self->msg, num_excs, num_excs > 1 ? "s" : ""); } +static PyObject * +BaseExceptionGroup_repr(PyObject *op) +{ + PyBaseExceptionGroupObject *self = PyBaseExceptionGroupObject_CAST(op); + assert(self->msg); + + PyObject *exceptions_str = NULL; + + /* Use the saved exceptions string for custom sequences. */ + if (self->excs_str) { + exceptions_str = Py_NewRef(self->excs_str); + } + else { + assert(self->excs); + + /* Older versions delegated to BaseException, inserting the current + * value of self.args[1]; but this can be mutable and go out-of-sync + * with self.exceptions. Instead, use self.exceptions for accuracy, + * making it look like self.args[1] for backwards compatibility. 
*/ + if (PyList_Check(PyTuple_GET_ITEM(self->args, 1))) { + PyObject *exceptions_list = PySequence_List(self->excs); + if (!exceptions_list) { + return NULL; + } + + exceptions_str = PyObject_Repr(exceptions_list); + Py_DECREF(exceptions_list); + } + else { + exceptions_str = PyObject_Repr(self->excs); + } + + if (!exceptions_str) { + return NULL; + } + } + + assert(exceptions_str != NULL); + + const char *name = _PyType_Name(Py_TYPE(self)); + PyObject *repr = PyUnicode_FromFormat( + "%s(%R, %U)", name, + self->msg, exceptions_str); + + Py_DECREF(exceptions_str); + return repr; +} + /*[clinic input] @critical_section BaseExceptionGroup.derive @@ -1697,7 +1762,7 @@ static PyMethodDef BaseExceptionGroup_methods[] = { ComplexExtendsException(PyExc_BaseException, BaseExceptionGroup, BaseExceptionGroup, BaseExceptionGroup_new /* new */, BaseExceptionGroup_methods, BaseExceptionGroup_members, - 0 /* getset */, BaseExceptionGroup_str, + 0 /* getset */, BaseExceptionGroup_str, BaseExceptionGroup_repr, "A combination of multiple unrelated exceptions."); /* @@ -2431,7 +2496,7 @@ static PyGetSetDef OSError_getset[] = { ComplexExtendsException(PyExc_Exception, OSError, OSError, OSError_new, OSError_methods, OSError_members, OSError_getset, - OSError_str, + OSError_str, 0, "Base class for I/O related errors."); @@ -2572,7 +2637,7 @@ static PyMethodDef NameError_methods[] = { ComplexExtendsException(PyExc_Exception, NameError, NameError, 0, NameError_methods, NameError_members, - 0, BaseException_str, "Name not found globally."); + 0, BaseException_str, 0, "Name not found globally."); /* * UnboundLocalError extends NameError @@ -2706,7 +2771,7 @@ static PyMethodDef AttributeError_methods[] = { ComplexExtendsException(PyExc_Exception, AttributeError, AttributeError, 0, AttributeError_methods, AttributeError_members, - 0, BaseException_str, "Attribute not found."); + 0, BaseException_str, 0, "Attribute not found."); /* * SyntaxError extends Exception @@ -2905,7 +2970,7 @@ static PyMemberDef SyntaxError_members[] = { ComplexExtendsException(PyExc_Exception, SyntaxError, SyntaxError, 0, 0, SyntaxError_members, 0, - SyntaxError_str, "Invalid syntax."); + SyntaxError_str, 0, "Invalid syntax."); /* @@ -2965,7 +3030,7 @@ KeyError_str(PyObject *op) } ComplexExtendsException(PyExc_LookupError, KeyError, BaseException, - 0, 0, 0, 0, KeyError_str, "Mapping key not found."); + 0, 0, 0, 0, KeyError_str, 0, "Mapping key not found."); /* diff --git a/PCbuild/_remote_debugging.vcxproj b/PCbuild/_remote_debugging.vcxproj index 3ef34ef0563..c91c9cf3652 100644 --- a/PCbuild/_remote_debugging.vcxproj +++ b/PCbuild/_remote_debugging.vcxproj @@ -102,6 +102,7 @@ + diff --git a/PCbuild/_remote_debugging.vcxproj.filters b/PCbuild/_remote_debugging.vcxproj.filters index 5c117a79f3b..b37a2c5575c 100644 --- a/PCbuild/_remote_debugging.vcxproj.filters +++ b/PCbuild/_remote_debugging.vcxproj.filters @@ -24,6 +24,9 @@ Source Files + + Source Files + Source Files diff --git a/Python/ceval.c b/Python/ceval.c index c4e1dbc8bcf..7d9101861fa 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -2289,6 +2289,16 @@ clear_gen_frame(PyThreadState *tstate, _PyInterpreterFrame * frame) void _PyEval_FrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame * frame) { + // Update last_profiled_frame for remote profiler frame caching. + // By this point, tstate->current_frame is already set to the parent frame. + // Only update if we're popping the exact frame that was last profiled. 
+ // This avoids corrupting the cache when transient frames (called and returned + // between profiler samples) update last_profiled_frame to addresses the + // profiler never saw. + if (tstate->last_profiled_frame != NULL && tstate->last_profiled_frame == frame) { + tstate->last_profiled_frame = tstate->current_frame; + } + if (frame->owner == FRAME_OWNED_BY_THREAD) { clear_thread_frame(tstate, frame); } @@ -2345,7 +2355,7 @@ _PyEvalFramePushAndInit_Ex(PyThreadState *tstate, _PyStackRef func, PyObject *kwnames = NULL; _PyStackRef *newargs; PyObject *const *object_array = NULL; - _PyStackRef stack_array[8]; + _PyStackRef stack_array[8] = {0}; if (has_dict) { object_array = _PyStack_UnpackDict(tstate, _PyTuple_ITEMS(callargs), nargs, kwargs, &kwnames); if (object_array == NULL) { @@ -2408,7 +2418,7 @@ _PyEval_Vector(PyThreadState *tstate, PyFunctionObject *func, if (kwnames) { total_args += PyTuple_GET_SIZE(kwnames); } - _PyStackRef stack_array[8]; + _PyStackRef stack_array[8] = {0}; _PyStackRef *arguments; if (total_args <= 8) { arguments = stack_array; @@ -3343,6 +3353,9 @@ PyEval_MergeCompilerFlags(PyCompilerFlags *cf) { PyThreadState *tstate = _PyThreadState_GET(); _PyInterpreterFrame *current_frame = tstate->current_frame; + if (current_frame == tstate->base_frame) { + current_frame = NULL; + } int result = cf->cf_flags != 0; if (current_frame != NULL) { diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 2045d3710ef..96ac33a4b25 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -2586,7 +2586,7 @@ Py_EndInterpreter(PyThreadState *tstate) if (tstate != _PyThreadState_GET()) { Py_FatalError("thread is not current"); } - if (tstate->current_frame != NULL) { + if (tstate->current_frame != tstate->base_frame) { Py_FatalError("thread still has a frame"); } interp->finalizing = 1; diff --git a/Python/pystate.c b/Python/pystate.c index c12a1418e74..2956e785405 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -1482,7 +1482,31 @@ init_threadstate(_PyThreadStateImpl *_tstate, // This is cleared when PyGILState_Ensure() creates the thread state. 
tstate->gilstate_counter = 1; - tstate->current_frame = NULL; + // Initialize the embedded base frame - sentinel at the bottom of the frame stack + _tstate->base_frame.previous = NULL; + _tstate->base_frame.f_executable = PyStackRef_None; + _tstate->base_frame.f_funcobj = PyStackRef_NULL; + _tstate->base_frame.f_globals = NULL; + _tstate->base_frame.f_builtins = NULL; + _tstate->base_frame.f_locals = NULL; + _tstate->base_frame.frame_obj = NULL; + _tstate->base_frame.instr_ptr = NULL; + _tstate->base_frame.stackpointer = _tstate->base_frame.localsplus; + _tstate->base_frame.return_offset = 0; + _tstate->base_frame.owner = FRAME_OWNED_BY_INTERPRETER; + _tstate->base_frame.visited = 0; +#ifdef Py_DEBUG + _tstate->base_frame.lltrace = 0; +#endif +#ifdef Py_GIL_DISABLED + _tstate->base_frame.tlbc_index = 0; +#endif + _tstate->base_frame.localsplus[0] = PyStackRef_NULL; + + // current_frame starts pointing to the base frame + tstate->current_frame = &_tstate->base_frame; + // base_frame pointer for profilers to validate stack unwinding + tstate->base_frame = &_tstate->base_frame; tstate->datastack_chunk = NULL; tstate->datastack_top = NULL; tstate->datastack_limit = NULL; @@ -1660,7 +1684,7 @@ PyThreadState_Clear(PyThreadState *tstate) int verbose = _PyInterpreterState_GetConfig(tstate->interp)->verbose; - if (verbose && tstate->current_frame != NULL) { + if (verbose && tstate->current_frame != tstate->base_frame) { /* bpo-20526: After the main thread calls _PyInterpreterState_SetFinalizing() in Py_FinalizeEx() (or in Py_EndInterpreter() for subinterpreters), diff --git a/Python/remote_debug.h b/Python/remote_debug.h index 517568358a0..1c02870d3af 100644 --- a/Python/remote_debug.h +++ b/Python/remote_debug.h @@ -1102,6 +1102,115 @@ _Py_RemoteDebug_ReadRemoteMemory(proc_handle_t *handle, uintptr_t remote_address #endif } +#if defined(__linux__) && HAVE_PROCESS_VM_READV +// Fallback write using /proc/pid/mem +static int +_Py_RemoteDebug_WriteRemoteMemoryFallback(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src) +{ + if (handle->memfd == -1) { + if (open_proc_mem_fd(handle) < 0) { + return -1; + } + } + + struct iovec local[1]; + Py_ssize_t result = 0; + Py_ssize_t written = 0; + + do { + local[0].iov_base = (char*)src + result; + local[0].iov_len = len - result; + off_t offset = remote_address + result; + + written = pwritev(handle->memfd, local, 1, offset); + if (written < 0) { + PyErr_SetFromErrno(PyExc_OSError); + return -1; + } + + result += written; + } while ((size_t)written != local[0].iov_len); + return 0; +} +#endif // __linux__ + +// Platform-independent memory write function +UNUSED static int +_Py_RemoteDebug_WriteRemoteMemory(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src) +{ +#ifdef MS_WINDOWS + SIZE_T written = 0; + SIZE_T result = 0; + do { + if (!WriteProcessMemory(handle->hProcess, (LPVOID)(remote_address + result), (const char*)src + result, len - result, &written)) { + PyErr_SetFromWindowsErr(0); + DWORD error = GetLastError(); + _set_debug_exception_cause(PyExc_OSError, + "WriteProcessMemory failed for PID %d at address 0x%lx " + "(size %zu, partial write %zu bytes): Windows error %lu", + handle->pid, remote_address + result, len - result, result, error); + return -1; + } + result += written; + } while (result < len); + return 0; +#elif defined(__linux__) && HAVE_PROCESS_VM_READV + if (handle->memfd != -1) { + return _Py_RemoteDebug_WriteRemoteMemoryFallback(handle, remote_address, len, src); + } + struct iovec 
local[1]; + struct iovec remote[1]; + Py_ssize_t result = 0; + Py_ssize_t written = 0; + + do { + local[0].iov_base = (void*)((char*)src + result); + local[0].iov_len = len - result; + remote[0].iov_base = (void*)((char*)remote_address + result); + remote[0].iov_len = len - result; + + written = process_vm_writev(handle->pid, local, 1, remote, 1, 0); + if (written < 0) { + if (errno == ENOSYS) { + return _Py_RemoteDebug_WriteRemoteMemoryFallback(handle, remote_address, len, src); + } + PyErr_SetFromErrno(PyExc_OSError); + _set_debug_exception_cause(PyExc_OSError, + "process_vm_writev failed for PID %d at address 0x%lx " + "(size %zu, partial write %zd bytes): %s", + handle->pid, remote_address + result, len - result, result, strerror(errno)); + return -1; + } + + result += written; + } while ((size_t)written != local[0].iov_len); + return 0; +#elif defined(__APPLE__) && defined(TARGET_OS_OSX) && TARGET_OS_OSX + kern_return_t kr = mach_vm_write( + handle->task, + (mach_vm_address_t)remote_address, + (vm_offset_t)src, + (mach_msg_type_number_t)len); + + if (kr != KERN_SUCCESS) { + switch (kr) { + case KERN_PROTECTION_FAILURE: + PyErr_SetString(PyExc_PermissionError, "Not enough permissions to write memory"); + break; + case KERN_INVALID_ARGUMENT: + PyErr_SetString(PyExc_PermissionError, "Invalid argument to mach_vm_write"); + break; + default: + PyErr_Format(PyExc_RuntimeError, "Unknown error writing memory: %d", (int)kr); + } + return -1; + } + return 0; +#else + Py_UNREACHABLE(); +#endif +} + UNUSED static int _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle, uintptr_t addr, diff --git a/Python/remote_debugging.c b/Python/remote_debugging.c index 71ffb17ed68..5b50b95db94 100644 --- a/Python/remote_debugging.c +++ b/Python/remote_debugging.c @@ -24,104 +24,11 @@ read_memory(proc_handle_t *handle, uintptr_t remote_address, size_t len, void* d return _Py_RemoteDebug_ReadRemoteMemory(handle, remote_address, len, dst); } -// Why is pwritev not guarded? Except on Android API level 23 (no longer -// supported), HAVE_PROCESS_VM_READV is sufficient. 
-#if defined(__linux__) && HAVE_PROCESS_VM_READV -static int -write_memory_fallback(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src) -{ - if (handle->memfd == -1) { - if (open_proc_mem_fd(handle) < 0) { - return -1; - } - } - - struct iovec local[1]; - Py_ssize_t result = 0; - Py_ssize_t written = 0; - - do { - local[0].iov_base = (char*)src + result; - local[0].iov_len = len - result; - off_t offset = remote_address + result; - - written = pwritev(handle->memfd, local, 1, offset); - if (written < 0) { - PyErr_SetFromErrno(PyExc_OSError); - return -1; - } - - result += written; - } while ((size_t)written != local[0].iov_len); - return 0; -} -#endif // __linux__ - +// Use the shared write function from remote_debug.h static int write_memory(proc_handle_t *handle, uintptr_t remote_address, size_t len, const void* src) { -#ifdef MS_WINDOWS - SIZE_T written = 0; - SIZE_T result = 0; - do { - if (!WriteProcessMemory(handle->hProcess, (LPVOID)(remote_address + result), (const char*)src + result, len - result, &written)) { - PyErr_SetFromWindowsErr(0); - return -1; - } - result += written; - } while (result < len); - return 0; -#elif defined(__linux__) && HAVE_PROCESS_VM_READV - if (handle->memfd != -1) { - return write_memory_fallback(handle, remote_address, len, src); - } - struct iovec local[1]; - struct iovec remote[1]; - Py_ssize_t result = 0; - Py_ssize_t written = 0; - - do { - local[0].iov_base = (void*)((char*)src + result); - local[0].iov_len = len - result; - remote[0].iov_base = (void*)((char*)remote_address + result); - remote[0].iov_len = len - result; - - written = process_vm_writev(handle->pid, local, 1, remote, 1, 0); - if (written < 0) { - if (errno == ENOSYS) { - return write_memory_fallback(handle, remote_address, len, src); - } - PyErr_SetFromErrno(PyExc_OSError); - return -1; - } - - result += written; - } while ((size_t)written != local[0].iov_len); - return 0; -#elif defined(__APPLE__) && TARGET_OS_OSX - kern_return_t kr = mach_vm_write( - pid_to_task(handle->pid), - (mach_vm_address_t)remote_address, - (vm_offset_t)src, - (mach_msg_type_number_t)len); - - if (kr != KERN_SUCCESS) { - switch (kr) { - case KERN_PROTECTION_FAILURE: - PyErr_SetString(PyExc_PermissionError, "Not enough permissions to write memory"); - break; - case KERN_INVALID_ARGUMENT: - PyErr_SetString(PyExc_PermissionError, "Invalid argument to mach_vm_write"); - break; - default: - PyErr_Format(PyExc_RuntimeError, "Unknown error writing memory: %d", (int)kr); - } - return -1; - } - return 0; -#else - Py_UNREACHABLE(); -#endif + return _Py_RemoteDebug_WriteRemoteMemory(handle, remote_address, len, src); } static int diff --git a/Python/traceback.c b/Python/traceback.c index 48f9b4d04c6..8af63c22a9f 100644 --- a/Python/traceback.c +++ b/Python/traceback.c @@ -1036,7 +1036,7 @@ static int dump_frame(int fd, _PyInterpreterFrame *frame) { if (frame->owner == FRAME_OWNED_BY_INTERPRETER) { - /* Ignore trampoline frame */ + /* Ignore trampoline frames and base frame sentinel */ return 0; } diff --git a/Tools/inspection/benchmark_external_inspection.py b/Tools/inspection/benchmark_external_inspection.py index 0ac7ac4d385..9c40c2f4492 100644 --- a/Tools/inspection/benchmark_external_inspection.py +++ b/Tools/inspection/benchmark_external_inspection.py @@ -434,7 +434,7 @@ def main(): elif args.threads == "only_active": kwargs["only_active_thread"] = True unwinder = _remote_debugging.RemoteUnwinder( - process.pid, **kwargs + process.pid, cache_frames=True, **kwargs ) results = 
benchmark(unwinder, duration_seconds=args.duration) finally:
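
For reference, the derived ``frame_cache_hit_rate`` returned by get_stats() counts both full and partial hits over all cache lookups. A small Python sketch of the same formula used in the C implementation above (the helper name is invented for illustration):

    def frame_cache_hit_rate(stats):
        # Full hits and partial hits both count as successful cache lookups.
        lookups = (stats["frame_cache_hits"]
                   + stats["frame_cache_partial_hits"]
                   + stats["frame_cache_misses"])
        if lookups == 0:
            return 0.0
        return 100.0 * (stats["frame_cache_hits"]
                        + stats["frame_cache_partial_hits"]) / lookups
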