[release-branch.go1.24] runtime: handle system goroutines later in goroutine profiling

In Go 1.24 and earlier, it's possible for a just-starting finalizer
goroutine to have its stack traced in goroutine profiles even though
it shouldn't, because it wasn't visible to the goroutine profile STW.
This can also bump other goroutines' stacks out of the profile, because
the profiler wasn't expecting an extra stack. Fix this by letting all
system goroutines participate in the goroutine profiler's state
machine, like in the CL this is cherry-picking. This ensures that the
finalizer goroutine will be counted as a system goroutine in this
just-starting state, but still composes with the old way of doing
things, because the finalizer goroutine is advanced to the terminal
state during the STW. In Go 1.25, this is fixing a slightly different
issue, but the root of the problem is the same: all goroutines should
participate in the profiler's state machine, and they do not.
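
To make the mechanism concrete: each goroutine carries a small per-profile
state word (gp.goroutineProfiled) that the profiler and the scheduler advance
with compare-and-swap. The sketch below is a self-contained illustration of
that idea, not the runtime's actual code; the type fakeG, the function
tryRecord, and the profile* constants are invented for the example. The point
is that system goroutines still walk through the state machine, they just
never contribute a stack.

package main

import (
	"fmt"
	"sync/atomic"
)

// Simplified stand-ins for the runtime's per-goroutine profile states.
const (
	profileAbsent     uint32 = iota // not yet handled by the current profile
	profileInProgress               // someone is handling this goroutine right now
	profileSatisfied                // handled: recorded, or deliberately skipped
)

// fakeG models only the pieces of a goroutine this sketch needs.
type fakeG struct {
	id       int
	system   bool          // stands in for isSystemGoroutine
	profiled atomic.Uint32 // stands in for gp.goroutineProfiled
}

// tryRecord mirrors the shape of the fix: every goroutine, system or not,
// participates in the state machine; only user goroutines get a stack recorded.
func tryRecord(g *fakeG, profile *[]int) {
	for {
		prev := g.profiled.Load()
		if prev == profileSatisfied || prev == profileInProgress {
			return // already handled, or being handled by someone else
		}
		if g.profiled.CompareAndSwap(prev, profileInProgress) {
			break
		}
	}
	if !g.system {
		// Only a user goroutine contributes an entry, exactly once.
		*profile = append(*profile, g.id)
	}
	g.profiled.Store(profileSatisfied)
}

func main() {
	gs := []*fakeG{{id: 1}, {id: 2, system: true}, {id: 3}}
	var profile []int
	for _, g := range gs {
		tryRecord(g, &profile)
		tryRecord(g, &profile) // second visit is a no-op thanks to the state word
	}
	fmt.Println(profile) // [1 3]: goroutine 2 was visited but not recorded
}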

For #74090.
Fixes #74363.

Change-Id: Icb9a164a033be22aaa942d19e828e895f700ca74
Reviewed-on: https://go-review.googlesource.com/c/go/+/680477
Reviewed-by: Carlos Amedee <carlos@golang.org>
Auto-Submit: Michael Knyszek <mknyszek@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
(cherry picked from commit 281cfcfc1b)
Reviewed-on: https://go-review.googlesource.com/c/go/+/684017
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
Auto-Submit: Dmitri Shuralyov <dmitshur@google.com>
Michael Anthony Knyszek 2025-06-10 16:42:59 +00:00 committed by Gopher Robot
parent e038690847
commit 21b488bb60
2 changed files with 50 additions and 5 deletions

@@ -1503,11 +1503,6 @@ func tryRecordGoroutineProfile(gp1 *g, pcbuf []uintptr, yield func()) {
 		// so here we check _Gdead first.
 		return
 	}
-	if isSystemGoroutine(gp1, true) {
-		// System goroutines should not appear in the profile. (The finalizer
-		// goroutine is marked as "already profiled".)
-		return
-	}
 
 	for {
 		prev := gp1.goroutineProfiled.Load()
@@ -1545,6 +1540,17 @@ func tryRecordGoroutineProfile(gp1 *g, pcbuf []uintptr, yield func()) {
 // stack), or from the scheduler in preparation to execute gp1 (running on the
 // system stack).
 func doRecordGoroutineProfile(gp1 *g, pcbuf []uintptr) {
+	if isSystemGoroutine(gp1, false) {
+		// System goroutines should not appear in the profile.
+		// Check this here and not in tryRecordGoroutineProfile because isSystemGoroutine
+		// may change on a goroutine while it is executing, so while the scheduler might
+		// see a system goroutine, goroutineProfileWithLabelsConcurrent might not, and
+		// this inconsistency could cause invariants to be violated, such as trying to
+		// record the stack of a running goroutine below. In short, we still want system
+		// goroutines to participate in the same state machine on gp1.goroutineProfiled as
+		// everything else, we just don't record the stack in the profile.
+		return
+	}
 	if readgstatus(gp1) == _Grunning {
 		print("doRecordGoroutineProfile gp1=", gp1.goid, "\n")
 		throw("cannot read stack of running goroutine")

@@ -1808,6 +1808,45 @@ func TestGoroutineProfileCoro(t *testing.T) {
 	goroutineProf.WriteTo(io.Discard, 1)
 }
 
+// This test tries to provoke a situation wherein the finalizer goroutine is
+// erroneously inspected by the goroutine profiler in such a way that could
+// cause a crash. See go.dev/issue/74090.
+func TestGoroutineProfileIssue74090(t *testing.T) {
+	testenv.MustHaveParallelism(t)
+
+	goroutineProf := Lookup("goroutine")
+
+	// T is a pointer type so it won't be allocated by the tiny
+	// allocator, which can lead to its finalizer not being called
+	// during this test.
+	type T *byte
+	for range 10 {
+		// We use finalizers for this test because finalizers transition between
+		// system and user goroutine on each call, since there's substantially
+		// more work to do to set up a finalizer call. Cleanups, on the other hand,
+		// transition once for a whole batch, and so are less likely to trigger
+		// the failure. Under stress testing conditions this test fails approximately
+		// 5 times every 1000 executions on a 64 core machine without the appropriate
+		// fix, which is not ideal but if this test crashes at all, it's a clear
+		// signal that something is broken.
+		var objs []*T
+		for range 10000 {
+			obj := new(T)
+			runtime.SetFinalizer(obj, func(_ interface{}) {})
+			objs = append(objs, obj)
+		}
+		objs = nil
+
+		// Queue up all the finalizers.
+		runtime.GC()
+
+		// Try to run a goroutine profile concurrently with finalizer execution
+		// to trigger the bug.
+		var w strings.Builder
+		goroutineProf.WriteTo(&w, 1)
+	}
+}
+
 func BenchmarkGoroutine(b *testing.B) {
 	withIdle := func(n int, fn func(b *testing.B)) func(b *testing.B) {
 		return func(b *testing.B) {
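
As a rough way to exercise the regression test above (assuming the test lives
in the runtime/pprof package, and with an arbitrary repetition count), one
could run something like:

	go test -run TestGoroutineProfileIssue74090 -count=100 runtime/pprof

As the test's own comment notes, the crash only reproduces a handful of times
per thousand executions under stress, so repeated runs are the practical way
to gain confidence in the fix.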