runtime: select GC mark workers during start-the-world

When the GC starts today, procresize and startTheWorldWithSema don't
consider the additional Ps required to run the mark workers; they
resume only the Ps necessary to run the normal user goroutines.

Once those Ps start, findRunnable and findRunnableGCWorker determine
that a GC worker is necessary and run the worker instead, calling wakep
to wake another P to run the original user goroutine.

This is unfortunate because it disrupts the intentional placement of Ps
on Ms that procresize does. It also slightly delays start-the-world, as
it takes several sequential wakeps to get all Ps started.

To address this, procresize explicitly assigns GC mark workers to Ps
before starting the world. The assignment occurs _after_ selecting
runnable Ps, so that mark workers prefer Ps that were previously idle.
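
To illustrate the ordering, here is a small self-contained toy model of
the idea (the toyP type, assign helper, and markWorkersWanted count are
illustrative stand-ins, not runtime types; the real change is in the
procresize hunk below):

package main

import "fmt"

// toyP is a stand-in for a P: it either has runnable user work or not.
type toyP struct {
    id             int
    hasRunnableG   bool
    runsMarkWorker bool
}

// assign mirrors the new ordering: Ps with runnable user goroutines keep
// their work, then GC mark workers go to previously-idle Ps, and whatever
// remains is truly idle.
func assign(ps []*toyP, markWorkersWanted int) (runnable, idle []*toyP) {
    for _, p := range ps {
        if p.hasRunnableG {
            runnable = append(runnable, p) // step 1: user work stays put
        } else {
            idle = append(idle, p)
        }
    }
    for markWorkersWanted > 0 && len(idle) > 0 {
        p := idle[0] // step 2: mark workers prefer previously-idle Ps
        idle = idle[1:]
        p.runsMarkWorker = true
        runnable = append(runnable, p)
        markWorkersWanted--
    }
    return runnable, idle // step 3: leftovers go to the idle list
}

func main() {
    ps := []*toyP{{id: 0, hasRunnableG: true}, {id: 1}, {id: 2, hasRunnableG: true}, {id: 3}}
    runnable, idle := assign(ps, 1)
    fmt.Printf("%d runnable Ps (one running a mark worker), %d idle\n",
        len(runnable), len(idle))
}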

Note that if fewer than 25% of Ps are idle then we won't be able to
assign all dedicated workers, and some of the Ps intended for user
goroutines will convert to dedicated workers once they reach
findRunnableGCWorker.
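
For context, the pacer targets 25% background utilization
(gcBackgroundUtilization), so the dedicated worker count comes out to
roughly GOMAXPROCS/4. A rough sketch of that arithmetic, not the pacer's
exact code (the fractional worker the real pacer uses to absorb rounding
error is omitted):

package main

import "fmt"

func main() {
    // Assumes the runtime's 25% background utilization target; the
    // fractional-worker adjustment the real pacer performs is left out of
    // this estimate.
    const gcBackgroundUtilization = 0.25
    for _, procs := range []int{1, 4, 8, 16} {
        goal := float64(procs) * gcBackgroundUtilization
        dedicated := int(goal + 0.5) // round to the nearest whole worker
        fmt.Printf("GOMAXPROCS=%d -> utilization goal %.2f, ~%d dedicated worker(s)\n",
            procs, goal, dedicated)
    }
}

At GOMAXPROCS=8, as used by the new TraceGCSTW test program below, that
works out to 2 dedicated workers, matching the "2 dedicated workers" of
slack budgeted in the test's comment.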

Also note that stack scanning temporarily suspends the goroutine. Resume
occurs through ready, which will move the goroutine to the local runq of
the P that did the scan. Thus there is still a source of migration at
some point during the GC.

For #65694.

Cq-Include-Trybots: luci.golang.try:gotip-linux-amd64-longtest
Change-Id: I6a6a636c51f39f4f4bc716aa87de68f6ebe163a5
Reviewed-on: https://go-review.googlesource.com/c/go/+/721002
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Michael Pratt <mpratt@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Author: Michael Pratt
Date: 2025-11-17 16:08:21 -05:00 (committed by Gopher Robot)
Commit: a18aff8057 (parent: 829779f4fe)
4 changed files with 236 additions and 13 deletions


@@ -870,9 +870,12 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) {
         gcCPULimiter.update(now)
     }
-    ok, now := c.assignWaitingGCWorker(pp, now)
-    if !ok {
-        return nil, now
+    // If a worker wasn't already assigned by procresize, assign one now.
+    if pp.nextGCMarkWorker == nil {
+        ok, now := c.assignWaitingGCWorker(pp, now)
+        if !ok {
+            return nil, now
+        }
     }
     node := pp.nextGCMarkWorker


@@ -4135,11 +4135,23 @@ top:
     gp, inheritTime, tryWakeP := findRunnable() // blocks until work is available
 
+    // May be on a new P.
+    pp = mp.p.ptr()
+
     // findRunnable may have collected an allp snapshot. The snapshot is
     // only required within findRunnable. Clear it to all GC to collect the
     // slice.
     mp.clearAllpSnapshot()
 
+    // If the P was assigned a next GC mark worker but findRunnable
+    // selected anything else, release the worker so another P may run it.
+    //
+    // N.B. If this occurs because a higher-priority goroutine was selected
+    // (trace reader), then tryWakeP is set, which will wake another P to
+    // run the worker. If this occurs because the GC is no longer active,
+    // there is no need to wakep.
+    gcController.releaseNextGCMarkWorker(pp)
+
     if debug.dontfreezetheworld > 0 && freezing.Load() {
         // See comment in freezetheworld. We don't want to perturb
         // scheduler state, so we didn't gcstopm in findRunnable, but
@@ -6036,8 +6048,10 @@ func procresize(nprocs int32) *p {
         unlock(&allpLock)
     }
 
+    // Assign Ms to Ps with runnable goroutines.
     var runnablePs *p
     var runnablePsNeedM *p
+    var idlePs *p
     for i := nprocs - 1; i >= 0; i-- {
         pp := allp[i]
         if gp.m.p.ptr() == pp {
@@ -6045,7 +6059,8 @@ func procresize(nprocs int32) *p {
         }
         pp.status = _Pidle
         if runqempty(pp) {
-            pidleput(pp, now)
+            pp.link.set(idlePs)
+            idlePs = pp
             continue
         }
@@ -6071,6 +6086,8 @@ func procresize(nprocs int32) *p {
         pp.link.set(runnablePs)
         runnablePs = pp
     }
+    // Assign Ms to remaining runnable Ps without usable oldm. See comment
+    // above.
     for runnablePsNeedM != nil {
         pp := runnablePsNeedM
         runnablePsNeedM = pp.link.ptr()
@@ -6081,6 +6098,62 @@ func procresize(nprocs int32) *p {
         runnablePs = pp
     }
 
+    // Now that we've assigned Ms to Ps with runnable goroutines, assign GC
+    // mark workers to remaining idle Ps, if needed.
+    //
+    // By assigning GC workers to Ps here, we slightly speed up starting
+    // the world, as we will start enough Ps to run all of the user
+    // goroutines and GC mark workers all at once, rather than using a
+    // sequence of wakep calls as each P's findRunnable realizes it needs
+    // to run a mark worker instead of a user goroutine.
+    //
+    // By assigning GC workers to Ps only _after_ previously-running Ps are
+    // assigned Ms, we ensure that goroutines previously running on a P
+    // continue to run on the same P, with GC mark workers preferring
+    // previously-idle Ps. This helps prevent goroutines from shuffling
+    // around too much across STW.
+    //
+    // N.B., if there aren't enough Ps left in idlePs for all of the GC
+    // mark workers, then findRunnable will still choose to run mark
+    // workers on Ps assigned above.
+    //
+    // N.B., we do this during any STW in the mark phase, not just the
+    // sweep termination STW that starts the mark phase. gcBgMarkWorker
+    // always preempts by removing itself from the P, so even unrelated
+    // STWs during the mark require that Ps reselect mark workers upon
+    // restart.
+    if gcBlackenEnabled != 0 {
+        for idlePs != nil {
+            pp := idlePs
+            ok, _ := gcController.assignWaitingGCWorker(pp, now)
+            if !ok {
+                // No more mark workers needed.
+                break
+            }
+
+            // Got a worker, P is now runnable.
+            //
+            // mget may return nil if there aren't enough Ms, in
+            // which case startTheWorldWithSema will start one.
+            //
+            // N.B. findRunnableGCWorker will make the worker G
+            // itself runnable.
+            idlePs = pp.link.ptr()
+            mp := mget()
+            pp.m.set(mp)
+            pp.link.set(runnablePs)
+            runnablePs = pp
+        }
+    }
+
+    // Finally, any remaining Ps are truly idle.
+    for idlePs != nil {
+        pp := idlePs
+        idlePs = pp.link.ptr()
+        pidleput(pp, now)
+    }
+
     stealOrder.reset(uint32(nprocs))
     var int32p *int32 = &gomaxprocs // make compiler check that gomaxprocs is an int32
     atomic.Store((*uint32)(unsafe.Pointer(int32p)), uint32(nprocs))
@@ -6183,6 +6256,10 @@ func releasepNoTrace() *p {
         print("releasep: m=", gp.m, " m->p=", gp.m.p.ptr(), " p->m=", hex(pp.m), " p->status=", pp.status, "\n")
         throw("releasep: invalid p state")
     }
+
+    // P must clear its nextGCMarkWorker if it stops.
+    gcController.releaseNextGCMarkWorker(pp)
+
     gp.m.p = 0
     pp.m = 0
     pp.status = _Pidle


@@ -1221,7 +1221,7 @@ func TestTraceSTW(t *testing.T) {
     var errors int
     for i := range runs {
-        err := runTestTracesSTW(t, i)
+        err := runTestTracesSTW(t, i, "TraceSTW", "stop-the-world (read mem stats)")
         if err != nil {
             t.Logf("Run %d failed: %v", i, err)
             errors++
@@ -1235,7 +1235,43 @@ func TestTraceSTW(t *testing.T) {
     }
 }
 
-func runTestTracesSTW(t *testing.T, run int) (err error) {
+// TestTraceGCSTW verifies that goroutines continue running on the same M and P
+// after a GC STW.
+func TestTraceGCSTW(t *testing.T) {
+    // Very similar to TestTraceSTW, but using a STW that starts the GC.
+    // When the GC starts, the background GC mark workers start running,
+    // which provide an additional source of disturbance to the scheduler.
+    //
+    // procresize assigns GC workers to previously-idle Ps to avoid
+    // changing what the previously-running Ps are doing.
+
+    if testing.Short() {
+        t.Skip("skipping in -short mode")
+    }
+    if runtime.NumCPU() < 8 {
+        t.Skip("This test sets GOMAXPROCS=8 and wants to avoid thread descheduling as much as possible. Skip on machines with less than 8 CPUs")
+    }
+
+    const runs = 50
+    var errors int
+    for i := range runs {
+        err := runTestTracesSTW(t, i, "TraceGCSTW", "stop-the-world (GC sweep termination)")
+        if err != nil {
+            t.Logf("Run %d failed: %v", i, err)
+            errors++
+        }
+    }
+
+    pct := float64(errors)/float64(runs)
+    t.Logf("Errors: %d/%d = %f%%", errors, runs, 100*pct)
+    if pct > 0.25 {
+        t.Errorf("Error rate too high")
+    }
+}
+
+func runTestTracesSTW(t *testing.T, run int, name, stwType string) (err error) {
     t.Logf("Run %d", run)
 
     // By default, TSAN sleeps for 1s at exit to allow background
@@ -1243,7 +1279,7 @@ func runTestTracesSTW(t *testing.T, run int) (err error) {
     // much, since we are running 50 iterations, so disable the sleep.
     //
     // Outside of race mode, GORACE does nothing.
-    buf := []byte(runTestProg(t, "testprog", "TraceSTW", "GORACE=atexit_sleep_ms=0"))
+    buf := []byte(runTestProg(t, "testprog", name, "GORACE=atexit_sleep_ms=0"))
 
     // We locally "fail" the run (return an error) if the trace exhibits
     // unwanted scheduling. i.e., the target goroutines did not remain on
@@ -1253,7 +1289,7 @@ func runTestTracesSTW(t *testing.T, run int) (err error) {
     // occur, such as a trace parse error.
     defer func() {
         if err != nil || t.Failed() {
-            testtrace.Dump(t, fmt.Sprintf("TestTraceSTW-run%d", run), []byte(buf), false)
+            testtrace.Dump(t, fmt.Sprintf("Test%s-run%d", name, run), []byte(buf), false)
         }
     }()
@@ -1509,12 +1545,10 @@ findEnd:
             break findEnd
         case trace.EventRangeBegin:
             r := ev.Range()
-            if r.Name == "stop-the-world (read mem stats)" {
+            if r.Name == stwType {
                 // Note when we see the STW begin. This is not
                 // load bearing; it's purpose is simply to fail
-                // the test if we manage to remove the STW from
-                // ReadMemStat, so we remember to change this
-                // test to add some new source of STW.
+                // the test if we accidentally remove the STW.
                 stwSeen = true
             }
         }


@@ -7,15 +7,18 @@ package main
 import (
     "context"
     "log"
+    "math/rand/v2"
     "os"
     "runtime"
     "runtime/debug"
+    "runtime/metrics"
     "runtime/trace"
     "sync/atomic"
 )
 
 func init() {
     register("TraceSTW", TraceSTW)
+    register("TraceGCSTW", TraceGCSTW)
 }
 
 // The parent writes to ping and waits for the children to write back
@@ -53,7 +56,7 @@ func TraceSTW() {
     // https://go.dev/issue/65694). Alternatively, we could just ignore the
     // trace if the GC runs.
     runtime.GOMAXPROCS(4)
-    debug.SetGCPercent(0)
+    debug.SetGCPercent(-1)
 
     if err := trace.Start(os.Stdout); err != nil {
         log.Fatalf("failed to start tracing: %v", err)
@@ -86,6 +89,112 @@ func TraceSTW() {
     stop.Store(true)
 }
 
+// Variant of TraceSTW for GC STWs. We want the GC mark workers to start on
+// previously-idle Ps, rather than bumping the current P.
+func TraceGCSTW() {
+    ctx := context.Background()
+
+    // The idea here is to have 2 target goroutines that are constantly
+    // running. When the world restarts after STW, we expect these
+    // goroutines to continue execution on the same M and P.
+    //
+    // Set GOMAXPROCS=8 to make room for the 2 target goroutines, 1 parent,
+    // 2 dedicated workers, and a bit of slack.
+    //
+    // Disable the GC initially so we can be sure it only triggers once we
+    // are ready.
+    runtime.GOMAXPROCS(8)
+    debug.SetGCPercent(-1)
+
+    if err := trace.Start(os.Stdout); err != nil {
+        log.Fatalf("failed to start tracing: %v", err)
+    }
+    defer trace.Stop()
+
+    for i := range 2 {
+        go traceSTWTarget(i)
+    }
+
+    // Wait for children to start running.
+    ping.Store(1)
+    for pong[0].Load() != 1 {}
+    for pong[1].Load() != 1 {}
+
+    trace.Log(ctx, "TraceSTW", "start")
+
+    // STW
+    triggerGC()
+
+    // Make sure to run long enough for the children to schedule again
+    // after STW. This is included for good measure, but the goroutines
+    // really ought to have already scheduled since the entire GC
+    // completed.
+    ping.Store(2)
+    for pong[0].Load() != 2 {}
+    for pong[1].Load() != 2 {}
+
+    trace.Log(ctx, "TraceSTW", "end")
+
+    stop.Store(true)
+}
+
+func triggerGC() {
+    // Allocate a bunch to trigger the GC rather than using runtime.GC. The
+    // latter blocks until the GC is complete, which is convenient, but
+    // messes with scheduling as it gives this P a chance to steal the
+    // other goroutines before their Ps get up and running again.
+
+    // Bring heap size up prior to enabling the GC to ensure that there is
+    // a decent amount of work in case the GC triggers immediately upon
+    // re-enabling.
+    for range 1000 {
+        alloc()
+    }
+
+    sample := make([]metrics.Sample, 1)
+    sample[0].Name = "/gc/cycles/total:gc-cycles"
+    metrics.Read(sample)
+    start := sample[0].Value.Uint64()
+
+    debug.SetGCPercent(100)
+
+    // Keep allocating until the GC is complete. We really only need to
+    // continue until the mark workers are scheduled, but there isn't a
+    // good way to measure that.
+    for {
+        metrics.Read(sample)
+        if sample[0].Value.Uint64() != start {
+            return
+        }
+        alloc()
+    }
+}
+
+// Allocate a tree data structure to generate plenty of scan work for the GC.
+type node struct {
+    children []*node
+}
+
+var gcSink node
+
+func alloc() {
+    // 10% chance of adding a node at each layer.
+    curr := &gcSink
+    for {
+        if len(curr.children) == 0 || rand.Float32() < 0.1 {
+            curr.children = append(curr.children, new(node))
+            return
+        }
+        i := rand.IntN(len(curr.children))
+        curr = curr.children[i]
+    }
+}
+
 // Manually insert a morestack call. Leaf functions can omit morestack, but
 // non-leaf functions should include them.