runtime,runtime/pprof: clean up goroutine leak profile writing

Cleaned up goroutine leak profile extraction:
- removed the acquisition of the goroutineProfile semaphore
- inlined the call to saveg when recording stacks instead of using
  doRecordGoroutineProfile, which had side effects on
  goroutineProfile fields

Added regression tests for the goroutine leak profiling frontend,
covering the binary and debug=1 profile formats.
Added stress tests for concurrent goroutine and goroutine leak profile
requests.

Change-Id: I55c1bcef11e9a7fb7699b4c5a2353e594d3e7173
GitHub-Last-Rev: 5e9eb3b1d8
GitHub-Pull-Request: golang/go#76045
Reviewed-on: https://go-review.googlesource.com/c/go/+/714580
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
Vlad Saioc 2025-11-06 09:53:28 +00:00 committed by Michael Knyszek
parent b8b84b789e
commit 8873e8bea2
3 changed files with 253 additions and 16 deletions
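For orientation before the diffs: the leak profile is served through the standard runtime/pprof frontend, so a client requests it like any other named profile. A minimal sketch, assuming the profile is registered under the name "goroutineleak" (suggested by the header string the new tests match below, not confirmed by this diff itself):

package main

import (
	"os"
	"runtime/pprof"
	"time"
)

func main() {
	// Leak a goroutine: it blocks forever on a receive that no one completes.
	go func() { <-make(chan struct{}) }()

	// Give the goroutine a chance to block (the new tests below do the same).
	time.Sleep(100 * time.Millisecond)

	// Writing the profile first runs a leak-detecting GC cycle
	// (see writeGoroutineLeak in the second file), then dumps the leaks.
	if p := pprof.Lookup("goroutineleak"); p != nil {
		p.WriteTo(os.Stderr, 1) // debug=1: human-readable text format
	}
}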

src/runtime/mprof.go

@@ -1261,7 +1261,7 @@ func goroutineProfileWithLabels(p []profilerecord.StackRecord, labels []unsafe.Pointer)
 //go:linkname pprof_goroutineLeakProfileWithLabels
 func pprof_goroutineLeakProfileWithLabels(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) {
-	return goroutineLeakProfileWithLabelsConcurrent(p, labels)
+	return goroutineLeakProfileWithLabels(p, labels)
 }
 
 // labels may be nil. If labels is non-nil, it must have the same length as p.
@@ -1323,30 +1323,26 @@ func goroutineLeakProfileWithLabelsConcurrent(p []profilerecord.StackRecord, labels []unsafe.Pointer)
 		return work.goroutineLeak.count, false
 	}
 
-	// Use the same semaphore as goroutineProfileWithLabelsConcurrent,
-	// because ultimately we still use goroutine profiles.
-	semacquire(&goroutineProfile.sema)
-
-	// Unlike in goroutineProfileWithLabelsConcurrent, we don't need to
-	// save the current goroutine stack, because it is obviously not leaked.
 	pcbuf := makeProfStack() // see saveg() for explanation
 
 	// Prepare a profile large enough to store all leaked goroutines.
 	n = work.goroutineLeak.count
 	if n > len(p) {
-		// There's not enough space in p to store the whole profile, so (per the
-		// contract of runtime.GoroutineProfile) we're not allowed to write to p
-		// at all and must return n, false.
-		semrelease(&goroutineProfile.sema)
+		// There's not enough space in p to store the whole profile, so
+		// we're not allowed to write to p at all and must return n, false.
 		return n, false
 	}
 
 	// Visit each leaked goroutine and try to record its stack.
+	var offset int
 	forEachGRace(func(gp1 *g) {
-		if readgstatus(gp1) == _Gleaked {
-			doRecordGoroutineProfile(gp1, pcbuf)
+		if readgstatus(gp1)&^_Gscan == _Gleaked {
+			systemstack(func() { saveg(^uintptr(0), ^uintptr(0), gp1, &p[offset], pcbuf) })
+			if labels != nil {
+				labels[offset] = gp1.labels
+			}
+			offset++
 		}
 	})
@@ -1354,7 +1350,6 @@ func goroutineLeakProfileWithLabelsConcurrent(p []profilerecord.StackRecord, labels []unsafe.Pointer)
 		raceacquire(unsafe.Pointer(&labelSync))
 	}
 
-	semrelease(&goroutineProfile.sema)
 	return n, true
 }
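Two details in the hunks above are easy to miss. First, the status check now masks the _Gscan bit, so a leaked goroutine whose stack is concurrently being scanned by the GC is still recognized. Second, saveg is invoked directly on the system stack, avoiding doRecordGoroutineProfile's side effects on goroutineProfile fields (per the commit message). An annotated paraphrase of the new recording step (not a compilable unit on its own; readgstatus, saveg, systemstack, and the _G* constants are runtime-internal):

status := readgstatus(gp1) // may carry the _Gscan bit during a stack scan
if status&^_Gscan == _Gleaked {
	// ^uintptr(0) for pc/sp tells the unwinder to start from gp1's saved
	// scheduler state rather than from the caller's registers.
	systemstack(func() { saveg(^uintptr(0), ^uintptr(0), gp1, &p[offset], pcbuf) })
}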

src/runtime/pprof/pprof.go

@@ -228,6 +228,31 @@ var mutexProfile = &Profile{
 	write: writeMutex,
 }
 
+// goroutineLeakProfileLock ensures that the goroutine leak profile writer observes the
+// leaked goroutines discovered during the goroutine leak detection GC cycle
+// that was triggered by the profile request.
+// This prevents a race condition between the garbage collector and the profile writer
+// when multiple profile requests are issued concurrently: the status of leaked goroutines
+// is reset to _Gwaiting at the beginning of a leak detection cycle, which may lead the
+// profile writer of another concurrent request to produce an incomplete profile.
+//
+// Example trace:
+//
+//	G1                    | GC                          | G2
+//	----------------------+-----------------------------+---------------------
+//	Request profile       | .                           | .
+//	.                     | .                           | Request profile
+//	.                     | [G1] Resets leaked g status | .
+//	.                     | [G1] Leaks detected         | .
+//	.                     | <New cycle>                 | .
+//	.                     | [G2] Resets leaked g status | .
+//	Write profile         | .                           | .
+//	.                     | [G2] Leaks detected         | .
+//	.                     | .                           | Write profile
+//	----------------------+-----------------------------+---------------------
+//	Incomplete profile    |+++++++++++++++++++++++++++++| Complete profile
+var goroutineLeakProfileLock sync.Mutex
+
 func lockProfiles() {
 	profiles.mu.Lock()
 	if profiles.m == nil {
@@ -763,6 +788,15 @@ func writeGoroutine(w io.Writer, debug int) error {
 
 // writeGoroutineLeak first invokes a GC cycle that performs goroutine leak detection.
 // It then writes the goroutine profile, filtering for leaked goroutines.
 func writeGoroutineLeak(w io.Writer, debug int) error {
+	// Acquire the goroutine leak detection lock and release
+	// it after the goroutine leak profile is written.
+	//
+	// While the critical section is long, this is needed to prevent
+	// a race condition between the garbage collector and the goroutine
+	// leak profile writer when multiple profile requests are issued concurrently.
+	goroutineLeakProfileLock.Lock()
+	defer goroutineLeakProfileLock.Unlock()
+
 	// Run the GC with leak detection first so that leaked goroutines
 	// may transition to the leaked state.
 	runtime_goroutineLeakGC()
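The invariant the new lock enforces is that the leak-detecting GC cycle and the profile write form one atomic unit per request; otherwise a second request's cycle can reset goroutine statuses after the first request's cycle marked them, as in the trace above. A hand-wavy, self-contained model of that discipline (detectLeaks and writeLeakProfile are hypothetical stand-ins, not runtime APIs):

package main

import (
	"fmt"
	"sync"
)

var (
	mu    sync.Mutex // plays the role of goroutineLeakProfileLock
	leaks []string
)

// detectLeaks models the leak-detection GC cycle: it resets the recorded
// leak set before re-discovering it. A writer running between another
// request's reset and re-discovery would observe an incomplete set.
func detectLeaks() {
	leaks = nil                       // "Resets leaked g status"
	leaks = append(leaks, "g1", "g2") // "Leaks detected"
}

// writeLeakProfile models writeGoroutineLeak: detection and writing share
// one critical section, so each request sees the leaks its own cycle found.
func writeLeakProfile(id int) {
	mu.Lock()
	defer mu.Unlock()
	detectLeaks()
	fmt.Printf("request %d writes: %v\n", id, leaks) // "Write profile"
}

func main() {
	var wg sync.WaitGroup
	for i := 0; i < 2; i++ { // two overlapping profile requests
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			writeLeakProfile(id)
		}(i)
	}
	wg.Wait()
}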

src/runtime/pprof/pprof_test.go

@@ -774,7 +774,11 @@ func TestMorestack(t *testing.T) {
 	for {
 		go func() {
 			growstack1()
-			c <- true
+			// NOTE(vsaioc): This goroutine may leak without this select.
+			select {
+			case c <- true:
+			case <-time.After(duration):
+			}
 		}()
 		select {
 		case <-t:
@@ -1565,6 +1569,210 @@ func containsCountsLabels(prof *profile.Profile, countLabels map[int64]map[string][]string) bool {
 	return true
 }
 
+func goroutineLeakExample() {
+	<-make(chan struct{})
+	panic("unreachable")
+}
+
+func TestGoroutineLeakProfileConcurrency(t *testing.T) {
+	const leakCount = 3
+	testenv.MustHaveParallelism(t)
+
+	regexLeakCount := regexp.MustCompile("goroutineleak profile: total ")
+	whiteSpace := regexp.MustCompile("\\s+")
+
+	// Regular goroutine profile. Used to check that there is no interference between
+	// the two profile types.
+	goroutineProf := Lookup("goroutine")
+	goroutineLeakProf := goroutineLeakProfile
+
+	// Check that a profile with debug information contains the expected leaked goroutine.
+	includesLeak := func(t *testing.T, name, s string) {
+		if !strings.Contains(s, "runtime/pprof.goroutineLeakExample") {
+			t.Errorf("%s profile does not contain expected leaked goroutine (runtime/pprof.goroutineLeakExample): %s", name, s)
+		}
+	}
+
+	checkFrame := func(i int, j int, locations []*profile.Location, expectedFunctionName string) {
+		if len(locations) <= i {
+			t.Errorf("leaked goroutine stack locations out of range at %d of %d", i+1, len(locations))
+			return
+		}
+		location := locations[i]
+		if len(location.Line) <= j {
+			t.Errorf("leaked goroutine stack location lines out of range at %d of %d", j+1, len(location.Line))
+			return
+		}
+		if location.Line[j].Function.Name != expectedFunctionName {
+			t.Errorf("leaked goroutine stack expected %s as the location[%d].Line[%d] but found %s (%s:%d)", expectedFunctionName, i, j, location.Line[j].Function.Name, location.Line[j].Function.Filename, location.Line[j].Line)
+		}
+	}
+
+	// We use this helper to count the total number of leaked goroutines in the profile.
+	//
+	// NOTE(vsaioc): This value should match the number of leaks produced in this test,
+	// but other tests could also leak goroutines, in which case we would have a mismatch
+	// when bulk-running tests.
+	//
+	// The two mismatching outcomes are therefore:
+	//   - More leaks than expected, which is a correctness issue with other tests.
+	//     In this case, this test effectively checks other tests wrt
+	//     goroutine leaks during bulk executions (e.g., running all.bash).
+	//   - Fewer leaks than expected; this is an unfortunate symptom of scheduling
+	//     non-determinism, which may occur once in a blue moon. We make
+	//     a best-effort attempt to allow the expected leaks to occur, by yielding
+	//     the main thread, but it is never a guarantee.
+	countLeaks := func(t *testing.T, number int, s string) {
+		// Strip the profile header.
+		parts := regexLeakCount.Split(s, -1)
+		if len(parts) < 2 {
+			t.Fatalf("goroutineleak profile does not contain 'goroutineleak profile: total ': %s\nparts: %v", s, parts)
+			return
+		}
+
+		parts = whiteSpace.Split(parts[1], -1)
+		count, err := strconv.ParseInt(parts[0], 10, 64)
+		if err != nil {
+			t.Fatalf("goroutineleak profile count is not a number: %s\nerror: %v", s, err)
+		}
+
+		// Check that the total number of leaked goroutines is exactly the expected number.
+		if count != int64(number) {
+			t.Errorf("goroutineleak profile does not contain exactly %d leaked goroutines: %d", number, count)
+		}
+	}
+
+	checkLeakStack := func(t *testing.T) func(pc uintptr, locations []*profile.Location, _ map[string][]string) {
+		return func(pc uintptr, locations []*profile.Location, _ map[string][]string) {
+			if pc != leakCount {
+				t.Errorf("expected %d leaked goroutines with specific stack configurations, but found %d", leakCount, pc)
+				return
+			}
+			switch len(locations) {
+			case 4:
+				// We expect a receive operation. This is the typical stack.
+				checkFrame(0, 0, locations, "runtime.gopark")
+				checkFrame(1, 0, locations, "runtime.chanrecv")
+				checkFrame(2, 0, locations, "runtime.chanrecv1")
+				switch len(locations[3].Line) {
+				case 2:
+					// Running `go func() { goroutineLeakExample() }()` will produce a stack with 2 lines.
+					// The anonymous function will have the call to goroutineLeakExample inlined.
+					checkFrame(3, 1, locations, "runtime/pprof.TestGoroutineLeakProfileConcurrency.func5")
+					fallthrough
+				case 1:
+					// Running `go goroutineLeakExample()` will produce a stack with 1 line.
+					checkFrame(3, 0, locations, "runtime/pprof.goroutineLeakExample")
+				default:
+					t.Errorf("leaked goroutine stack location expected 1 or 2 lines in the 4th location but found %d", len(locations[3].Line))
+					return
+				}
+			default:
+				message := fmt.Sprintf("leaked goroutine stack expected 4 or 5 locations but found %d", len(locations))
+				for _, location := range locations {
+					for _, line := range location.Line {
+						message += fmt.Sprintf("\n%s:%d", line.Function.Name, line.Line)
+					}
+				}
+				t.Errorf("%s", message)
+			}
+		}
+	}
+
+	// Leak some goroutines that will feature in the goroutine leak profile.
+	for i := 0; i < leakCount; i++ {
+		go goroutineLeakExample()
+		go func() {
+			// Leak another goroutine that will feature a slightly different stack.
+			// This includes the frame runtime/pprof.TestGoroutineLeakProfileConcurrency.func1.
+			goroutineLeakExample()
+			panic("unreachable")
+		}()
+		// Yield several times to allow the goroutines to leak.
+		runtime.Gosched()
+		runtime.Gosched()
+	}
+	// Give all goroutines a chance to leak.
+	time.Sleep(time.Second)
+
+	t.Run("profile contains leak", func(t *testing.T) {
+		var w strings.Builder
+		goroutineLeakProf.WriteTo(&w, 0)
+		parseProfile(t, []byte(w.String()), checkLeakStack(t))
+	})
+
+	t.Run("leak persists between sequential profiling runs", func(t *testing.T) {
+		for i := 0; i < 2; i++ {
+			var w strings.Builder
+			goroutineLeakProf.WriteTo(&w, 0)
+			parseProfile(t, []byte(w.String()), checkLeakStack(t))
+		}
+	})
+
+	// Concurrent calls to the goroutine leak profiler should not trigger data races
+	// or corruption.
+	t.Run("overlapping profile requests", func(t *testing.T) {
+		ctx := context.Background()
+		ctx, cancel := context.WithTimeout(ctx, time.Second)
+		defer cancel()
+
+		var wg sync.WaitGroup
+		for i := 0; i < 2; i++ {
+			wg.Add(1)
+			Do(ctx, Labels("i", fmt.Sprint(i)), func(context.Context) {
+				go func() {
+					defer wg.Done()
+					for ctx.Err() == nil {
+						var w strings.Builder
+						goroutineLeakProf.WriteTo(&w, 1)
+						countLeaks(t, 2*leakCount, w.String())
+						includesLeak(t, "goroutineleak", w.String())
+					}
+				}()
+			})
+		}
+		wg.Wait()
+	})
+
+	// Concurrent calls to the goroutine leak profiler should not trigger data races
+	// or corruption, or interfere with regular goroutine profiles.
+	t.Run("overlapping goroutine and goroutine leak profile requests", func(t *testing.T) {
+		ctx := context.Background()
+		ctx, cancel := context.WithTimeout(ctx, time.Second)
+		defer cancel()
+
+		var wg sync.WaitGroup
+		for i := 0; i < 2; i++ {
+			wg.Add(2)
+			Do(ctx, Labels("i", fmt.Sprint(i)), func(context.Context) {
+				go func() {
+					defer wg.Done()
+					for ctx.Err() == nil {
+						var w strings.Builder
+						goroutineLeakProf.WriteTo(&w, 1)
+						countLeaks(t, 2*leakCount, w.String())
+						includesLeak(t, "goroutineleak", w.String())
+					}
+				}()
+				go func() {
+					defer wg.Done()
+					for ctx.Err() == nil {
+						var w strings.Builder
+						goroutineProf.WriteTo(&w, 1)
+						// The regular goroutine profile should see the leaked
+						// goroutines. We simply check that the goroutine leak
+						// profile does not corrupt the goroutine profile state.
+						includesLeak(t, "goroutine", w.String())
+					}
+				}()
+			})
+		}
+		wg.Wait()
+	})
+}
+
 func TestGoroutineProfileConcurrency(t *testing.T) {
 	testenv.MustHaveParallelism(t)
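A note on the text format the helpers above rely on: with debug=1, pprof's count-profile writer emits a header line of the form "<name> profile: total <N>" followed by one "<count> @ <addresses>" entry per unique stack, which is why countLeaks can split on "goroutineleak profile: total " and then on whitespace to read N. A hypothetical dump would begin roughly like:

goroutineleak profile: total 6
3 @ 0x... 0x... 0x... 0x...
#	0x...	runtime.gopark+0x...	runtime/proc.go:...

(The counts and addresses here are placeholders, not output from a real run.)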