gh-135953: Add GIL contention markers to sampling profiler Gecko format (#139485)

This commit enhances the Gecko format reporter in the sampling profiler to include markers for GIL acquisition events.
2025-12-07 13:50:06 +00:00 · 2025-11-17 12:46:26 +00:00 · 2025-11-17 12:46:26 +00:00 · 89a914c58d
commit 89a914c58d
parent 994ab5c922
9 changed files with 627 additions and 82 deletions
--- a/Include/cpython/pystate.h
+++ b/Include/cpython/pystate.h
@ -113,6 +113,9 @@ struct _ts {
    /* Currently holds the GIL. Must be its own field to avoid data races */
    int holds_gil;

+    /* Currently requesting the GIL */
+    int gil_requested;
+
    int _whence;

    /* Thread state (_Py_THREAD_ATTACHED, _Py_THREAD_DETACHED, _Py_THREAD_SUSPENDED).
--- a/Include/internal/pycore_debug_offsets.h
+++ b/Include/internal/pycore_debug_offsets.h
@ -106,6 +106,8 @@ typedef struct _Py_DebugOffsets {
        uint64_t native_thread_id;
        uint64_t datastack_chunk;
        uint64_t status;
+        uint64_t holds_gil;
+        uint64_t gil_requested;
    } thread_state;

    // InterpreterFrame offset;
@ -273,6 +275,8 @@ typedef struct _Py_DebugOffsets {
        .native_thread_id = offsetof(PyThreadState, native_thread_id), \
        .datastack_chunk = offsetof(PyThreadState, datastack_chunk), \
        .status = offsetof(PyThreadState, _status), \
+        .holds_gil = offsetof(PyThreadState, holds_gil), \
+        .gil_requested = offsetof(PyThreadState, gil_requested), \
    }, \
    .interpreter_frame = { \
        .size = sizeof(_PyInterpreterFrame), \
--- a/Lib/profiling/sampling/collector.py
+++ b/Lib/profiling/sampling/collector.py
@ -1,17 +1,14 @@
 from abc import ABC, abstractmethod

-# Enums are slow
-THREAD_STATE_RUNNING = 0
-THREAD_STATE_IDLE = 1
-THREAD_STATE_GIL_WAIT = 2
-THREAD_STATE_UNKNOWN = 3
-
-STATUS = {
-    THREAD_STATE_RUNNING: "running",
-    THREAD_STATE_IDLE: "idle",
-    THREAD_STATE_GIL_WAIT: "gil_wait",
-    THREAD_STATE_UNKNOWN: "unknown",
-}
+# Thread status flags
+try:
+    from _remote_debugging import THREAD_STATUS_HAS_GIL, THREAD_STATUS_ON_CPU, THREAD_STATUS_UNKNOWN, THREAD_STATUS_GIL_REQUESTED
+except ImportError:
+    # Fallback for tests or when module is not available
+    THREAD_STATUS_HAS_GIL = (1 << 0)
+    THREAD_STATUS_ON_CPU = (1 << 1)
+    THREAD_STATUS_UNKNOWN = (1 << 2)
+    THREAD_STATUS_GIL_REQUESTED = (1 << 3)

 class Collector(ABC):
    @abstractmethod
@ -26,8 +23,14 @@ def _iter_all_frames(self, stack_frames, skip_idle=False):
        """Iterate over all frame stacks from all interpreters and threads."""
        for interpreter_info in stack_frames:
            for thread_info in interpreter_info.threads:
-                if skip_idle and thread_info.status != THREAD_STATE_RUNNING:
-                    continue
+                # When skip_idle is set, skip threads that are not actively running.
+                # A thread is "active" if it holds the GIL or is on CPU.
+                if skip_idle:
+                    status_flags = thread_info.status
+                    has_gil = bool(status_flags & THREAD_STATUS_HAS_GIL)
+                    on_cpu = bool(status_flags & THREAD_STATUS_ON_CPU)
+                    if not (has_gil or on_cpu):
+                        continue
                frames = thread_info.frame_info
                if frames:
                    yield frames, thread_info.thread_id
--- a/Lib/profiling/sampling/gecko_collector.py
+++ b/Lib/profiling/sampling/gecko_collector.py
@ -1,9 +1,20 @@
+import itertools
 import json
 import os
 import platform
+import sys
+import threading
 import time

-from .collector import Collector, THREAD_STATE_RUNNING
+from .collector import Collector
+try:
+    from _remote_debugging import THREAD_STATUS_HAS_GIL, THREAD_STATUS_ON_CPU, THREAD_STATUS_UNKNOWN, THREAD_STATUS_GIL_REQUESTED
+except ImportError:
+    # Fallback if module not available (shouldn't happen in normal use)
+    THREAD_STATUS_HAS_GIL = (1 << 0)
+    THREAD_STATUS_ON_CPU = (1 << 1)
+    THREAD_STATUS_UNKNOWN = (1 << 2)
+    THREAD_STATUS_GIL_REQUESTED = (1 << 3)


 # Categories matching Firefox Profiler expectations
@ -11,14 +22,20 @@
    {"name": "Other", "color": "grey", "subcategories": ["Other"]},
    {"name": "Python", "color": "yellow", "subcategories": ["Other"]},
    {"name": "Native", "color": "blue", "subcategories": ["Other"]},
-    {"name": "Idle", "color": "transparent", "subcategories": ["Other"]},
+    {"name": "GC", "color": "orange", "subcategories": ["Other"]},
+    {"name": "GIL", "color": "green", "subcategories": ["Other"]},
+    {"name": "CPU", "color": "purple", "subcategories": ["Other"]},
+    {"name": "Code Type", "color": "red", "subcategories": ["Other"]},
 ]

 # Category indices
 CATEGORY_OTHER = 0
 CATEGORY_PYTHON = 1
 CATEGORY_NATIVE = 2
-CATEGORY_IDLE = 3
+CATEGORY_GC = 3
+CATEGORY_GIL = 4
+CATEGORY_CPU = 5
+CATEGORY_CODE_TYPE = 6

 # Subcategory indices
 DEFAULT_SUBCATEGORY = 0
@ -58,6 +75,56 @@ def __init__(self, *, skip_idle=False):
        self.last_sample_time = 0
        self.interval = 1.0  # Will be calculated from actual sampling

+        # State tracking for interval markers (tid -> start_time)
+        self.has_gil_start = {}           # Thread has the GIL
+        self.no_gil_start = {}            # Thread doesn't have the GIL
+        self.on_cpu_start = {}            # Thread is running on CPU
+        self.off_cpu_start = {}           # Thread is off CPU
+        self.python_code_start = {}       # Thread running Python code (has GIL)
+        self.native_code_start = {}       # Thread running native code (on CPU without GIL)
+        self.gil_wait_start = {}          # Thread waiting for GIL
+
+        # GC event tracking: track GC start time per thread
+        self.gc_start_per_thread = {}  # tid -> start_time
+
+        # Track which threads have been initialized for state tracking
+        self.initialized_threads = set()
+
+    def _track_state_transition(self, tid, condition, active_dict, inactive_dict,
+                                  active_name, inactive_name, category, current_time):
+        """Track binary state transitions and emit markers.
+
+        Args:
+            tid: Thread ID
+            condition: Whether the active state is true
+            active_dict: Dict tracking start time of active state
+            inactive_dict: Dict tracking start time of inactive state
+            active_name: Name for active state marker
+            inactive_name: Name for inactive state marker
+            category: Gecko category for the markers
+            current_time: Current timestamp
+        """
+        # On first observation of a thread, just record the current state
+        # without creating a marker (we don't know what the previous state was)
+        if tid not in self.initialized_threads:
+            if condition:
+                active_dict[tid] = current_time
+            else:
+                inactive_dict[tid] = current_time
+            return
+
+        # For already-initialized threads, track transitions
+        if condition:
+            active_dict.setdefault(tid, current_time)
+            if tid in inactive_dict:
+                self._add_marker(tid, inactive_name, inactive_dict.pop(tid),
+                               current_time, category)
+        else:
+            inactive_dict.setdefault(tid, current_time)
+            if tid in active_dict:
+                self._add_marker(tid, active_name, active_dict.pop(tid),
+                               current_time, category)
+
    def collect(self, stack_frames):
        """Collect a sample from stack frames."""
        current_time = (time.time() * 1000) - self.start_time
@ -69,19 +136,12 @@ def collect(self, stack_frames):
            ) / self.sample_count
        self.last_sample_time = current_time

+        # Process threads and track GC per thread
        for interpreter_info in stack_frames:
            for thread_info in interpreter_info.threads:
-                if (
-                    self.skip_idle
-                    and thread_info.status != THREAD_STATE_RUNNING
-                ):
-                    continue
-
                frames = thread_info.frame_info
-                if not frames:
-                    continue
-
                tid = thread_info.thread_id
+                gc_collecting = thread_info.gc_collecting

                # Initialize thread if needed
                if tid not in self.threads:
@ -89,6 +149,80 @@ def collect(self, stack_frames):

                thread_data = self.threads[tid]

+                # Decode status flags
+                status_flags = thread_info.status
+                has_gil = bool(status_flags & THREAD_STATUS_HAS_GIL)
+                on_cpu = bool(status_flags & THREAD_STATUS_ON_CPU)
+                gil_requested = bool(status_flags & THREAD_STATUS_GIL_REQUESTED)
+
+                # Track GIL possession (Has GIL / No GIL)
+                self._track_state_transition(
+                    tid, has_gil, self.has_gil_start, self.no_gil_start,
+                    "Has GIL", "No GIL", CATEGORY_GIL, current_time
+                )
+
+                # Track CPU state (On CPU / Off CPU)
+                self._track_state_transition(
+                    tid, on_cpu, self.on_cpu_start, self.off_cpu_start,
+                    "On CPU", "Off CPU", CATEGORY_CPU, current_time
+                )
+
+                # Track code type (Python Code / Native Code)
+                # This is tri-state: Python (has_gil), Native (on_cpu without gil), or Neither
+                if has_gil:
+                    self._track_state_transition(
+                        tid, True, self.python_code_start, self.native_code_start,
+                        "Python Code", "Native Code", CATEGORY_CODE_TYPE, current_time
+                    )
+                elif on_cpu:
+                    self._track_state_transition(
+                        tid, True, self.native_code_start, self.python_code_start,
+                        "Native Code", "Python Code", CATEGORY_CODE_TYPE, current_time
+                    )
+                else:
+                    # Thread is idle (neither has GIL nor on CPU) - close any open code markers
+                    # This handles the third state that _track_state_transition doesn't cover
+                    if tid in self.initialized_threads:
+                        if tid in self.python_code_start:
+                            self._add_marker(tid, "Python Code", self.python_code_start.pop(tid),
+                                           current_time, CATEGORY_CODE_TYPE)
+                        if tid in self.native_code_start:
+                            self._add_marker(tid, "Native Code", self.native_code_start.pop(tid),
+                                           current_time, CATEGORY_CODE_TYPE)
+
+                # Track "Waiting for GIL" intervals (one-sided tracking)
+                if gil_requested:
+                    self.gil_wait_start.setdefault(tid, current_time)
+                elif tid in self.gil_wait_start:
+                    self._add_marker(tid, "Waiting for GIL", self.gil_wait_start.pop(tid),
+                                   current_time, CATEGORY_GIL)
+
+                # Track GC events - attribute to all threads that hold the GIL during GC
+                # (GC is interpreter-wide but runs on whichever thread(s) have the GIL)
+                # If GIL switches during GC, multiple threads will get GC markers
+                if gc_collecting and has_gil:
+                    # Start GC marker if not already started for this thread
+                    if tid not in self.gc_start_per_thread:
+                        self.gc_start_per_thread[tid] = current_time
+                elif tid in self.gc_start_per_thread:
+                    # End GC marker if it was running for this thread
+                    # (either GC finished or thread lost GIL)
+                    self._add_marker(tid, "GC Collecting", self.gc_start_per_thread.pop(tid),
+                                   current_time, CATEGORY_GC)
+
+                # Mark thread as initialized after processing all state transitions
+                self.initialized_threads.add(tid)
+
+                # Categorize: idle if neither has GIL nor on CPU
+                is_idle = not has_gil and not on_cpu
+
+                # Skip idle threads if skip_idle is enabled
+                if self.skip_idle and is_idle:
+                    continue
+
+                if not frames:
+                    continue
+
                # Process the stack
                stack_index = self._process_stack(thread_data, frames)

@ -102,7 +236,6 @@ def collect(self, stack_frames):

    def _create_thread(self, tid):
        """Create a new thread structure with processed profile format."""
-        import threading

        # Determine if this is the main thread
        try:
@ -181,7 +314,7 @@ def _create_thread(self, tid):
                "functionSize": [],
                "length": 0,
            },
-            # Markers - processed format
+            # Markers - processed format (arrays)
            "markers": {
                "data": [],
                "name": [],
@ -215,6 +348,27 @@ def _intern_string(self, s):
        self.global_string_map[s] = idx
        return idx

+    def _add_marker(self, tid, name, start_time, end_time, category):
+        """Add an interval marker for a specific thread."""
+        if tid not in self.threads:
+            return
+
+        thread_data = self.threads[tid]
+        duration = end_time - start_time
+
+        name_idx = self._intern_string(name)
+        markers = thread_data["markers"]
+        markers["name"].append(name_idx)
+        markers["startTime"].append(start_time)
+        markers["endTime"].append(end_time)
+        markers["phase"].append(1)  # 1 = interval marker
+        markers["category"].append(category)
+        markers["data"].append({
+            "type": name.replace(" ", ""),
+            "duration": duration,
+            "tid": tid
+        })
+
    def _process_stack(self, thread_data, frames):
        """Process a stack and return the stack index."""
        if not frames:
@ -383,15 +537,63 @@ def _get_or_create_frame(self, thread_data, func_idx, lineno):
        frame_cache[frame_key] = frame_idx
        return frame_idx

+    def _finalize_markers(self):
+        """Close any open markers at the end of profiling."""
+        end_time = self.last_sample_time
+
+        # Close all open markers for each thread using a generic approach
+        marker_states = [
+            (self.has_gil_start, "Has GIL", CATEGORY_GIL),
+            (self.no_gil_start, "No GIL", CATEGORY_GIL),
+            (self.on_cpu_start, "On CPU", CATEGORY_CPU),
+            (self.off_cpu_start, "Off CPU", CATEGORY_CPU),
+            (self.python_code_start, "Python Code", CATEGORY_CODE_TYPE),
+            (self.native_code_start, "Native Code", CATEGORY_CODE_TYPE),
+            (self.gil_wait_start, "Waiting for GIL", CATEGORY_GIL),
+            (self.gc_start_per_thread, "GC Collecting", CATEGORY_GC),
+        ]
+
+        for state_dict, marker_name, category in marker_states:
+            for tid in list(state_dict.keys()):
+                self._add_marker(tid, marker_name, state_dict[tid], end_time, category)
+                del state_dict[tid]
+
    def export(self, filename):
        """Export the profile to a Gecko JSON file."""
+
        if self.sample_count > 0 and self.last_sample_time > 0:
            self.interval = self.last_sample_time / self.sample_count

-        profile = self._build_profile()
+        # Spinner for progress indication
+        spinner = itertools.cycle(['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'])
+        stop_spinner = threading.Event()

-        with open(filename, "w") as f:
-            json.dump(profile, f, separators=(",", ":"))
+        def spin():
+            message = 'Building Gecko profile...'
+            while not stop_spinner.is_set():
+                sys.stderr.write(f'\r{next(spinner)} {message}')
+                sys.stderr.flush()
+                time.sleep(0.1)
+            # Clear the spinner line
+            sys.stderr.write('\r' + ' ' * (len(message) + 3) + '\r')
+            sys.stderr.flush()
+
+        spinner_thread = threading.Thread(target=spin, daemon=True)
+        spinner_thread.start()
+
+        try:
+            # Finalize any open markers before building profile
+            self._finalize_markers()
+
+            profile = self._build_profile()
+
+            with open(filename, "w") as f:
+                json.dump(profile, f, separators=(",", ":"))
+        finally:
+            stop_spinner.set()
+            spinner_thread.join(timeout=1.0)
+            # Small delay to ensure the clear happens
+            time.sleep(0.01)

        print(f"Gecko profile written to {filename}")
        print(
@ -416,6 +618,7 @@ def _build_profile(self):
            frame_table["length"] = len(frame_table["func"])
            func_table["length"] = len(func_table["name"])
            resource_table["length"] = len(resource_table["name"])
+            thread_data["markers"]["length"] = len(thread_data["markers"]["name"])

            # Clean up internal caches
            del thread_data["_stackCache"]
--- a/Lib/profiling/sampling/sample.py
+++ b/Lib/profiling/sampling/sample.py
@ -21,6 +21,7 @@
 PROFILING_MODE_WALL = 0
 PROFILING_MODE_CPU = 1
 PROFILING_MODE_GIL = 2
+PROFILING_MODE_ALL = 3  # Combines GIL + CPU checks


 def _parse_mode(mode_string):
@ -136,18 +137,20 @@ def _run_with_sync(original_cmd):


 class SampleProfiler:
-    def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL):
+    def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL, skip_non_matching_threads=True):
        self.pid = pid
        self.sample_interval_usec = sample_interval_usec
        self.all_threads = all_threads
        if _FREE_THREADED_BUILD:
            self.unwinder = _remote_debugging.RemoteUnwinder(
-                self.pid, all_threads=self.all_threads, mode=mode
+                self.pid, all_threads=self.all_threads, mode=mode,
+                skip_non_matching_threads=skip_non_matching_threads
            )
        else:
            only_active_threads = bool(self.all_threads)
            self.unwinder = _remote_debugging.RemoteUnwinder(
-                self.pid, only_active_thread=only_active_threads, mode=mode
+                self.pid, only_active_thread=only_active_threads, mode=mode,
+                skip_non_matching_threads=skip_non_matching_threads
            )
        # Track sample intervals and total sample count
        self.sample_intervals = deque(maxlen=100)
@ -614,14 +617,21 @@ def sample(
    realtime_stats=False,
    mode=PROFILING_MODE_WALL,
 ):
+    # PROFILING_MODE_ALL implies no skipping at all
+    if mode == PROFILING_MODE_ALL:
+        skip_non_matching_threads = False
+        skip_idle = False
+    else:
+        # Determine skip settings based on output format and mode
+        skip_non_matching_threads = output_format != "gecko"
+        skip_idle = mode != PROFILING_MODE_WALL
+
    profiler = SampleProfiler(
-        pid, sample_interval_usec, all_threads=all_threads, mode=mode
+        pid, sample_interval_usec, all_threads=all_threads, mode=mode,
+        skip_non_matching_threads=skip_non_matching_threads
    )
    profiler.realtime_stats = realtime_stats

-    # Determine skip_idle for collector compatibility
-    skip_idle = mode != PROFILING_MODE_WALL
-
    collector = None
    match output_format:
        case "pstats":
@ -633,7 +643,8 @@ def sample(
            collector = FlamegraphCollector(skip_idle=skip_idle)
            filename = filename or f"flamegraph.{pid}.html"
        case "gecko":
-            collector = GeckoCollector(skip_idle=skip_idle)
+            # Gecko format never skips idle threads to show full thread states
+            collector = GeckoCollector(skip_idle=False)
            filename = filename or f"gecko.{pid}.json"
        case _:
            raise ValueError(f"Invalid output format: {output_format}")
@ -882,6 +893,10 @@ def main():
    if args.format in ("collapsed", "gecko"):
        _validate_collapsed_format_args(args, parser)

+    # Validate that --mode is not used with --gecko
+    if args.format == "gecko" and args.mode != "wall":
+        parser.error("--mode option is incompatible with --gecko format. Gecko format automatically uses ALL mode (GIL + CPU analysis).")
+
    sort_value = args.sort if args.sort is not None else 2

    if args.module is not None and not args.module:
@ -900,7 +915,11 @@ def main():
    elif target_count > 1:
        parser.error("only one target type can be specified: -p/--pid, -m/--module, or script")

-    mode = _parse_mode(args.mode)
+    # Use PROFILING_MODE_ALL for gecko format, otherwise parse user's choice
+    if args.format == "gecko":
+        mode = PROFILING_MODE_ALL
+    else:
+        mode = _parse_mode(args.mode)

    if args.pid:
        sample(
--- a/Lib/test/test_external_inspection.py
+++ b/Lib/test/test_external_inspection.py
@ -23,6 +23,12 @@
 PROFILING_MODE_WALL = 0
 PROFILING_MODE_CPU = 1
 PROFILING_MODE_GIL = 2
+PROFILING_MODE_ALL = 3
+
+# Thread status flags
+THREAD_STATUS_HAS_GIL = (1 << 0)
+THREAD_STATUS_ON_CPU = (1 << 1)
+THREAD_STATUS_UNKNOWN = (1 << 2)

 try:
    from concurrent import interpreters
@ -1763,11 +1769,14 @@ def busy():
                            for thread_info in interpreter_info.threads:
                                statuses[thread_info.thread_id] = thread_info.status

-                        # Check if sleeper thread is idle and busy thread is running
+                        # Check if sleeper thread is off CPU and busy thread is on CPU
+                        # In the new flags system:
+                        # - sleeper should NOT have ON_CPU flag (off CPU)
+                        # - busy should have ON_CPU flag
                        if (sleeper_tid in statuses and
                            busy_tid in statuses and
-                            statuses[sleeper_tid] == 1 and
-                            statuses[busy_tid] == 0):
+                            not (statuses[sleeper_tid] & THREAD_STATUS_ON_CPU) and
+                            (statuses[busy_tid] & THREAD_STATUS_ON_CPU)):
                            break
                        time.sleep(0.5)  # Give a bit of time to let threads settle
                except PermissionError:
@ -1779,8 +1788,8 @@ def busy():
                self.assertIsNotNone(busy_tid, "Busy thread id not received")
                self.assertIn(sleeper_tid, statuses, "Sleeper tid not found in sampled threads")
                self.assertIn(busy_tid, statuses, "Busy tid not found in sampled threads")
-                self.assertEqual(statuses[sleeper_tid], 1, "Sleeper thread should be idle (1)")
-                self.assertEqual(statuses[busy_tid], 0, "Busy thread should be running (0)")
+                self.assertFalse(statuses[sleeper_tid] & THREAD_STATUS_ON_CPU, "Sleeper thread should be off CPU")
+                self.assertTrue(statuses[busy_tid] & THREAD_STATUS_ON_CPU, "Busy thread should be on CPU")

            finally:
                if client_socket is not None:
@ -1875,11 +1884,14 @@ def busy():
                            for thread_info in interpreter_info.threads:
                                statuses[thread_info.thread_id] = thread_info.status

-                        # Check if sleeper thread is idle (status 2 for GIL mode) and busy thread is running
+                        # Check if sleeper thread doesn't have GIL and busy thread has GIL
+                        # In the new flags system:
+                        # - sleeper should NOT have HAS_GIL flag (not holding the GIL while it sleeps)
+                        # - busy should have HAS_GIL flag
                        if (sleeper_tid in statuses and
                            busy_tid in statuses and
-                            statuses[sleeper_tid] == 2 and
-                            statuses[busy_tid] == 0):
+                            not (statuses[sleeper_tid] & THREAD_STATUS_HAS_GIL) and
+                            (statuses[busy_tid] & THREAD_STATUS_HAS_GIL)):
                            break
                        time.sleep(0.5)  # Give a bit of time to let threads settle
                except PermissionError:
@ -1891,8 +1903,8 @@ def busy():
                self.assertIsNotNone(busy_tid, "Busy thread id not received")
                self.assertIn(sleeper_tid, statuses, "Sleeper tid not found in sampled threads")
                self.assertIn(busy_tid, statuses, "Busy tid not found in sampled threads")
-                self.assertEqual(statuses[sleeper_tid], 2, "Sleeper thread should be idle (1)")
-                self.assertEqual(statuses[busy_tid], 0, "Busy thread should be running (0)")
+                self.assertFalse(statuses[sleeper_tid] & THREAD_STATUS_HAS_GIL, "Sleeper thread should not have GIL")
+                self.assertTrue(statuses[busy_tid] & THREAD_STATUS_HAS_GIL, "Busy thread should have GIL")

            finally:
                if client_socket is not None:
@ -1900,6 +1912,128 @@ def busy():
                p.terminate()
                p.wait(timeout=SHORT_TIMEOUT)

+    @unittest.skipIf(
+        sys.platform not in ("linux", "darwin", "win32"),
+        "Test only runs on supported platforms (Linux, macOS, or Windows)",
+    )
+    @unittest.skipIf(sys.platform == "android", "Android raises Linux-specific exception")
+    def test_thread_status_all_mode_detection(self):
+        port = find_unused_port()
+        script = textwrap.dedent(
+            f"""\
+            import socket
+            import threading
+            import time
+            import sys
+
+            def sleeper_thread():
+                conn = socket.create_connection(("localhost", {port}))
+                conn.sendall(b"sleeper:" + str(threading.get_native_id()).encode())
+                while True:
+                    time.sleep(1)
+
+            def busy_thread():
+                conn = socket.create_connection(("localhost", {port}))
+                conn.sendall(b"busy:" + str(threading.get_native_id()).encode())
+                while True:
+                    sum(range(100000))
+
+            t1 = threading.Thread(target=sleeper_thread)
+            t2 = threading.Thread(target=busy_thread)
+            t1.start()
+            t2.start()
+            t1.join()
+            t2.join()
+            """
+        )
+
+        with os_helper.temp_dir() as tmp_dir:
+            script_file = make_script(tmp_dir, "script", script)
+            server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+            server_socket.bind(("localhost", port))
+            server_socket.listen(2)
+            server_socket.settimeout(SHORT_TIMEOUT)
+
+            p = subprocess.Popen(
+                [sys.executable, script_file],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+            )
+
+            client_sockets = []
+            try:
+                sleeper_tid = None
+                busy_tid = None
+
+                # Receive thread IDs from the child process
+                for _ in range(2):
+                    client_socket, _ = server_socket.accept()
+                    client_sockets.append(client_socket)
+                    line = client_socket.recv(1024)
+                    if line:
+                        if line.startswith(b"sleeper:"):
+                            try:
+                                sleeper_tid = int(line.split(b":")[-1])
+                            except Exception:
+                                pass
+                        elif line.startswith(b"busy:"):
+                            try:
+                                busy_tid = int(line.split(b":")[-1])
+                            except Exception:
+                                pass
+
+                server_socket.close()
+
+                attempts = 10
+                statuses = {}
+                try:
+                    unwinder = RemoteUnwinder(p.pid, all_threads=True, mode=PROFILING_MODE_ALL,
+                                                skip_non_matching_threads=False)
+                    for _ in range(attempts):
+                        traces = unwinder.get_stack_trace()
+                        # Find threads and their statuses
+                        statuses = {}
+                        for interpreter_info in traces:
+                            for thread_info in interpreter_info.threads:
+                                statuses[thread_info.thread_id] = thread_info.status
+
+                        # Check ALL mode provides both GIL and CPU info
+                        # - sleeper should NOT have ON_CPU and NOT have HAS_GIL
+                        # - busy should have ON_CPU and have HAS_GIL
+                        if (sleeper_tid in statuses and
+                            busy_tid in statuses and
+                            not (statuses[sleeper_tid] & THREAD_STATUS_ON_CPU) and
+                            not (statuses[sleeper_tid] & THREAD_STATUS_HAS_GIL) and
+                            (statuses[busy_tid] & THREAD_STATUS_ON_CPU) and
+                            (statuses[busy_tid] & THREAD_STATUS_HAS_GIL)):
+                            break
+                        time.sleep(0.5)
+                except PermissionError:
+                    self.skipTest(
+                        "Insufficient permissions to read the stack trace"
+                    )
+
+                self.assertIsNotNone(sleeper_tid, "Sleeper thread id not received")
+                self.assertIsNotNone(busy_tid, "Busy thread id not received")
+                self.assertIn(sleeper_tid, statuses, "Sleeper tid not found in sampled threads")
+                self.assertIn(busy_tid, statuses, "Busy tid not found in sampled threads")
+
+                # Sleeper thread: off CPU, no GIL
+                self.assertFalse(statuses[sleeper_tid] & THREAD_STATUS_ON_CPU, "Sleeper should be off CPU")
+                self.assertFalse(statuses[sleeper_tid] & THREAD_STATUS_HAS_GIL, "Sleeper should not have GIL")
+
+                # Busy thread: on CPU, has GIL
+                self.assertTrue(statuses[busy_tid] & THREAD_STATUS_ON_CPU, "Busy should be on CPU")
+                self.assertTrue(statuses[busy_tid] & THREAD_STATUS_HAS_GIL, "Busy should have GIL")
+
+            finally:
+                for client_socket in client_sockets:
+                    client_socket.close()
+                p.terminate()
+                p.wait(timeout=SHORT_TIMEOUT)
+                p.stdout.close()
+                p.stderr.close()


 if __name__ == "__main__":
--- a/Lib/test/test_profiling/test_sampling_profiler.py
+++ b/Lib/test/test_profiling/test_sampling_profiler.py
@ -63,12 +63,14 @@ def __repr__(self):
 class MockThreadInfo:
    """Mock ThreadInfo for testing since the real one isn't accessible."""

-    def __init__(self, thread_id, frame_info):
+    def __init__(self, thread_id, frame_info, status=0, gc_collecting=False):  # Default status 0: no THREAD_STATUS_* flags set
        self.thread_id = thread_id
        self.frame_info = frame_info
+        self.status = status
+        self.gc_collecting = gc_collecting

    def __repr__(self):
-        return f"MockThreadInfo(thread_id={self.thread_id}, frame_info={self.frame_info})"
+        return f"MockThreadInfo(thread_id={self.thread_id}, frame_info={self.frame_info}, status={self.status}, gc_collecting={self.gc_collecting})"


 class MockInterpreterInfo:
@ -674,6 +676,97 @@ def test_gecko_collector_export(self):
        self.assertIn("func2", string_array)
        self.assertIn("other_func", string_array)

+    def test_gecko_collector_markers(self):
+        """Test Gecko profile markers for GIL and CPU state tracking."""
+        try:
+            from _remote_debugging import THREAD_STATUS_HAS_GIL, THREAD_STATUS_ON_CPU, THREAD_STATUS_GIL_REQUESTED
+        except ImportError:
+            THREAD_STATUS_HAS_GIL = (1 << 0)
+            THREAD_STATUS_ON_CPU = (1 << 1)
+            THREAD_STATUS_GIL_REQUESTED = (1 << 3)
+
+        collector = GeckoCollector()
+
+        # Status combinations for different thread states
+        HAS_GIL_ON_CPU = THREAD_STATUS_HAS_GIL | THREAD_STATUS_ON_CPU  # Running Python code
+        NO_GIL_ON_CPU = THREAD_STATUS_ON_CPU  # Running native code
+        WAITING_FOR_GIL = THREAD_STATUS_GIL_REQUESTED  # Waiting for GIL
+
+        # Simulate thread state transitions
+        collector.collect([
+            MockInterpreterInfo(0, [
+                MockThreadInfo(1, [("test.py", 10, "python_func")], status=HAS_GIL_ON_CPU)
+            ])
+        ])
+
+        collector.collect([
+            MockInterpreterInfo(0, [
+                MockThreadInfo(1, [("test.py", 15, "wait_func")], status=WAITING_FOR_GIL)
+            ])
+        ])
+
+        collector.collect([
+            MockInterpreterInfo(0, [
+                MockThreadInfo(1, [("test.py", 20, "python_func2")], status=HAS_GIL_ON_CPU)
+            ])
+        ])
+
+        collector.collect([
+            MockInterpreterInfo(0, [
+                MockThreadInfo(1, [("native.c", 100, "native_func")], status=NO_GIL_ON_CPU)
+            ])
+        ])
+
+        profile_data = collector._build_profile()
+
+        # Verify we have threads with markers
+        self.assertIn("threads", profile_data)
+        self.assertEqual(len(profile_data["threads"]), 1)
+        thread_data = profile_data["threads"][0]
+
+        # Check markers exist
+        self.assertIn("markers", thread_data)
+        markers = thread_data["markers"]
+
+        # Should have marker arrays
+        self.assertIn("name", markers)
+        self.assertIn("startTime", markers)
+        self.assertIn("endTime", markers)
+        self.assertIn("category", markers)
+        self.assertGreater(markers["length"], 0, "Should have generated markers")
+
+        # Get marker names from string table
+        string_array = profile_data["shared"]["stringArray"]
+        marker_names = [string_array[idx] for idx in markers["name"]]
+
+        # Verify we have different marker types
+        marker_name_set = set(marker_names)
+
+        # Should have "Has GIL" markers (when thread had GIL)
+        self.assertIn("Has GIL", marker_name_set, "Should have 'Has GIL' markers")
+
+        # Should have "No GIL" markers (when thread didn't have GIL)
+        self.assertIn("No GIL", marker_name_set, "Should have 'No GIL' markers")
+
+        # Should have "On CPU" markers (when thread was on CPU)
+        self.assertIn("On CPU", marker_name_set, "Should have 'On CPU' markers")
+
+        # Should have "Waiting for GIL" markers (when thread was waiting)
+        self.assertIn("Waiting for GIL", marker_name_set, "Should have 'Waiting for GIL' markers")
+
+        # Verify marker structure
+        for i in range(markers["length"]):
+            # All markers should be interval markers (phase = 1)
+            self.assertEqual(markers["phase"][i], 1, f"Marker {i} should be interval marker")
+
+            # All markers should have valid time range
+            start_time = markers["startTime"][i]
+            end_time = markers["endTime"][i]
+            self.assertLessEqual(start_time, end_time, f"Marker {i} should have valid time range")
+
+            # All markers should have valid category
+            self.assertGreaterEqual(markers["category"][i], 0, f"Marker {i} should have valid category")
+
    def test_pstats_collector_export(self):
        collector = PstatsCollector(
            sample_interval_usec=1000000
@ -2625,19 +2718,30 @@ def test_mode_validation(self):

    def test_frames_filtered_with_skip_idle(self):
        """Test that frames are actually filtered when skip_idle=True."""
+        # Import thread status flags
+        try:
+            from _remote_debugging import THREAD_STATUS_HAS_GIL, THREAD_STATUS_ON_CPU
+        except ImportError:
+            THREAD_STATUS_HAS_GIL = (1 << 0)
+            THREAD_STATUS_ON_CPU = (1 << 1)
+
        # Create mock frames with different thread statuses
        class MockThreadInfoWithStatus:
            def __init__(self, thread_id, frame_info, status):
                self.thread_id = thread_id
                self.frame_info = frame_info
                self.status = status
+                self.gc_collecting = False
+
+        # Create test data: active thread (HAS_GIL | ON_CPU), idle thread (neither), and another active thread
+        ACTIVE_STATUS = THREAD_STATUS_HAS_GIL | THREAD_STATUS_ON_CPU  # Has GIL and on CPU
+        IDLE_STATUS = 0  # Neither has GIL nor on CPU

-        # Create test data: running thread, idle thread, and another running thread
        test_frames = [
            MockInterpreterInfo(0, [
-                MockThreadInfoWithStatus(1, [MockFrameInfo("active1.py", 10, "active_func1")], 0),  # RUNNING
-                MockThreadInfoWithStatus(2, [MockFrameInfo("idle.py", 20, "idle_func")], 1),        # IDLE
-                MockThreadInfoWithStatus(3, [MockFrameInfo("active2.py", 30, "active_func2")], 0),  # RUNNING
+                MockThreadInfoWithStatus(1, [MockFrameInfo("active1.py", 10, "active_func1")], ACTIVE_STATUS),
+                MockThreadInfoWithStatus(2, [MockFrameInfo("idle.py", 20, "idle_func")], IDLE_STATUS),
+                MockThreadInfoWithStatus(3, [MockFrameInfo("active2.py", 30, "active_func2")], ACTIVE_STATUS),
            ])
        ]

--- a/Modules/_remote_debugging_module.c
+++ b/Modules/_remote_debugging_module.c
@ -11,6 +11,7 @@
 * HEADERS AND INCLUDES
 * ============================================================================ */

+#include <assert.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <stddef.h>
@ -81,6 +82,8 @@ typedef enum _WIN32_THREADSTATE {
 #define SIZEOF_TYPE_OBJ sizeof(PyTypeObject)
 #define SIZEOF_UNICODE_OBJ sizeof(PyUnicodeObject)
 #define SIZEOF_LONG_OBJ sizeof(PyLongObject)
+#define SIZEOF_GC_RUNTIME_STATE sizeof(struct _gc_runtime_state)
+#define SIZEOF_INTERPRETER_STATE sizeof(PyInterpreterState)

 // Calculate the minimum buffer size needed to read interpreter state fields
 // We need to read code_object_generation and potentially tlbc_generation
@ -178,8 +181,9 @@ static PyStructSequence_Desc CoroInfo_desc = {
 // ThreadInfo structseq type - replaces 2-tuple (thread_id, frame_info)
 static PyStructSequence_Field ThreadInfo_fields[] = {
    {"thread_id", "Thread ID"},
-    {"status", "Thread status"},
+    {"status", "Thread status (flags: HAS_GIL, ON_CPU, UNKNOWN or legacy enum)"},
    {"frame_info", "Frame information"},
+    {"gc_collecting", "Whether GC is collecting (interpreter-level)"},
    {NULL}
 };

@ -187,7 +191,7 @@ static PyStructSequence_Desc ThreadInfo_desc = {
    "_remote_debugging.ThreadInfo",
    "Information about a thread",
    ThreadInfo_fields,
-    2
+    3
 };

 // InterpreterInfo structseq type - replaces 2-tuple (interpreter_id, thread_list)
@ -247,9 +251,16 @@ enum _ThreadState {
 enum _ProfilingMode {
    PROFILING_MODE_WALL = 0,
    PROFILING_MODE_CPU = 1,
-    PROFILING_MODE_GIL = 2
+    PROFILING_MODE_GIL = 2,
+    PROFILING_MODE_ALL = 3  // Combines GIL + CPU checks
 };

+// Thread status flags (can be combined)
+#define THREAD_STATUS_HAS_GIL        (1 << 0)  // Thread has the GIL
+#define THREAD_STATUS_ON_CPU         (1 << 1)  // Thread is running on CPU
+#define THREAD_STATUS_UNKNOWN        (1 << 2)  // Status could not be determined
+#define THREAD_STATUS_GIL_REQUESTED  (1 << 3)  // Thread is waiting for the GIL
+
 typedef struct {
    PyObject_HEAD
    proc_handle_t handle;
@ -2650,34 +2661,70 @@ unwind_stack_for_thread(

    long tid = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id);

-    // Calculate thread status based on mode
-    int status = THREAD_STATE_UNKNOWN;
-    if (unwinder->mode == PROFILING_MODE_CPU) {
-        long pthread_id = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.thread_id);
-        status = get_thread_status(unwinder, tid, pthread_id);
-        if (status == -1) {
-            PyErr_Print();
-            PyErr_SetString(PyExc_RuntimeError, "Failed to get thread status");
-            goto error;
-        }
-    } else if (unwinder->mode == PROFILING_MODE_GIL) {
+    // Read GC collecting state from the interpreter (before any skip checks)
+    uintptr_t interp_addr = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.interp);
+
+    // Read the GC runtime state from the interpreter state
+    uintptr_t gc_addr = interp_addr + unwinder->debug_offsets.interpreter_state.gc;
+    char gc_state[SIZEOF_GC_RUNTIME_STATE];
+    if (_Py_RemoteDebug_PagedReadRemoteMemory(&unwinder->handle, gc_addr, unwinder->debug_offsets.gc.size, gc_state) < 0) {
+        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read GC state");
+        goto error;
+    }
+
+    int gc_collecting = GET_MEMBER(int, gc_state, unwinder->debug_offsets.gc.collecting);
+
+    // Calculate thread status using flags (always)
+    int status_flags = 0;
+
+    // Check GIL status
+    int has_gil = 0;
+    int gil_requested = 0;
 #ifdef Py_GIL_DISABLED
-        // All threads are considered running in free threading builds if they have a thread state attached
-        int active = GET_MEMBER(_thread_status, ts, unwinder->debug_offsets.thread_state.status).active;
-        status = active ? THREAD_STATE_RUNNING : THREAD_STATE_GIL_WAIT;
+    int active = GET_MEMBER(_thread_status, ts, unwinder->debug_offsets.thread_state.status).active;
+    has_gil = active;
 #else
-        status = (*current_tstate == gil_holder_tstate) ? THREAD_STATE_RUNNING : THREAD_STATE_GIL_WAIT;
+    // Read holds_gil directly from thread state
+    has_gil = GET_MEMBER(int, ts, unwinder->debug_offsets.thread_state.holds_gil);
+
+    // Check if thread is actively requesting the GIL
+    if (unwinder->debug_offsets.thread_state.gil_requested != 0) {
+        gil_requested = GET_MEMBER(int, ts, unwinder->debug_offsets.thread_state.gil_requested);
+    }
+
+    // Set GIL_REQUESTED flag if thread is waiting
+    if (!has_gil && gil_requested) {
+        status_flags |= THREAD_STATUS_GIL_REQUESTED;
+    }
 #endif
-    } else {
-        // PROFILING_MODE_WALL - all threads are considered running
-        status = THREAD_STATE_RUNNING;
+    if (has_gil) {
+        status_flags |= THREAD_STATUS_HAS_GIL;
+    }
+
+    // Assert that we never have both HAS_GIL and GIL_REQUESTED set at the same time
+    // This would indicate a race condition in the GIL state tracking
+    assert(!(has_gil && gil_requested));
+
+    // Check CPU status
+    long pthread_id = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.thread_id);
+    int cpu_status = get_thread_status(unwinder, tid, pthread_id);
+    if (cpu_status == -1) {
+        status_flags |= THREAD_STATUS_UNKNOWN;
+    } else if (cpu_status == THREAD_STATE_RUNNING) {
+        status_flags |= THREAD_STATUS_ON_CPU;
    }

    // Check if we should skip this thread based on mode
    int should_skip = 0;
-    if (unwinder->skip_non_matching_threads && status != THREAD_STATE_RUNNING &&
-        (unwinder->mode == PROFILING_MODE_CPU || unwinder->mode == PROFILING_MODE_GIL)) {
-        should_skip = 1;
+    if (unwinder->skip_non_matching_threads) {
+        if (unwinder->mode == PROFILING_MODE_CPU) {
+            // Skip if not on CPU
+            should_skip = !(status_flags & THREAD_STATUS_ON_CPU);
+        } else if (unwinder->mode == PROFILING_MODE_GIL) {
+            // Skip if doesn't have GIL
+            should_skip = !(status_flags & THREAD_STATUS_HAS_GIL);
+        }
+        // PROFILING_MODE_WALL and PROFILING_MODE_ALL never skip
    }

    if (should_skip) {
@ -2719,16 +2766,25 @@ unwind_stack_for_thread(
        goto error;
    }

-    PyObject *py_status = PyLong_FromLong(status);
+    // Always use status_flags
+    PyObject *py_status = PyLong_FromLong(status_flags);
    if (py_status == NULL) {
        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread status");
        goto error;
    }
-    PyErr_Print();

+    PyObject *py_gc_collecting = PyBool_FromLong(gc_collecting);
+    if (py_gc_collecting == NULL) {
+        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create gc_collecting");
+        Py_DECREF(py_status);
+        goto error;
+    }
+
+    // py_status contains status flags (bitfield)
    PyStructSequence_SetItem(result, 0, thread_id);
    PyStructSequence_SetItem(result, 1, py_status);  // Steals reference
    PyStructSequence_SetItem(result, 2, frame_info); // Steals reference
+    PyStructSequence_SetItem(result, 3, py_gc_collecting); // Steals reference

    cleanup_stack_chunks(&chunks);
    return result;
@ -3401,6 +3457,21 @@ _remote_debugging_exec(PyObject *m)
    if (rc < 0) {
        return -1;
    }
+
+    // Add thread status flag constants
+    if (PyModule_AddIntConstant(m, "THREAD_STATUS_HAS_GIL", THREAD_STATUS_HAS_GIL) < 0) {
+        return -1;
+    }
+    if (PyModule_AddIntConstant(m, "THREAD_STATUS_ON_CPU", THREAD_STATUS_ON_CPU) < 0) {
+        return -1;
+    }
+    if (PyModule_AddIntConstant(m, "THREAD_STATUS_UNKNOWN", THREAD_STATUS_UNKNOWN) < 0) {
+        return -1;
+    }
+    if (PyModule_AddIntConstant(m, "THREAD_STATUS_GIL_REQUESTED", THREAD_STATUS_GIL_REQUESTED) < 0) {
+        return -1;
+    }
+
    if (RemoteDebugging_InitState(st) < 0) {
        return -1;
    }
--- a/Python/ceval_gil.c
+++ b/Python/ceval_gil.c
@ -207,6 +207,7 @@ drop_gil_impl(PyThreadState *tstate, struct _gil_runtime_state *gil)
    _Py_atomic_store_int_relaxed(&gil->locked, 0);
    if (tstate != NULL) {
        tstate->holds_gil = 0;
+        tstate->gil_requested = 0;
    }
    COND_SIGNAL(gil->cond);
    MUTEX_UNLOCK(gil->mutex);
@ -320,6 +321,8 @@ take_gil(PyThreadState *tstate)

    MUTEX_LOCK(gil->mutex);

+    tstate->gil_requested = 1;
+
    int drop_requested = 0;
    while (_Py_atomic_load_int_relaxed(&gil->locked)) {
        unsigned long saved_switchnum = gil->switch_number;
@ -407,6 +410,7 @@ take_gil(PyThreadState *tstate)
    }
    assert(_PyThreadState_CheckConsistency(tstate));

+    tstate->gil_requested = 0;
    tstate->holds_gil = 1;
    _Py_unset_eval_breaker_bit(tstate, _PY_GIL_DROP_REQUEST_BIT);
    update_eval_breaker_for_thread(interp, tstate);