runtime: only use CPU time from the current window in the GC CPU limiter

Currently the GC CPU limiter consumes CPU time from a few pools, but because the events that flush to those pools may overlap, rather than be strictly contained within, the update window for the GC CPU limiter, the limiter's accounting is ultimately sloppy. This sloppiness complicates accounting for idle time more completely, and makes reasoning about the transient behavior of the GC CPU limiter much more difficult. To remedy this, this CL adds a field to the P struct that tracks the start time of any in-flight event the limiter might care about, along with information about the nature of that event. This timestamp is managed atomically so that the GC CPU limiter can come in and perform a read of the partial CPU time consumed by a given event. The limiter also updates the timestamp so that only what's left over is flushed by the event itself when it completes. The end result of this change is that, since the GC CPU limiter is aware of all past completed events, and all in-flight events, it can much more accurately collect the CPU time of events since the last update. There's still the possibility for skew, but any leftover time will be captured in the following update, and the magnitude of this leftover time is effectively bounded by the update period of the GC CPU limiter, which is much easier to consider. One caveat of managing this timestamp-type combo atomically is that they need to be packed in 64 bits. So, this CL gives up the top 3 bits of the timestamp and places the type information there. What this means is we effectively have only a 61-bit resolution timestamp. This is fine when the top 3 bits are the same between calls to nanotime, but becomes a problem on boundaries when those 3 bits change. These cases may cause hiccups in the GC CPU limiter by not accounting for some source of CPU time correctly, but with 61 bits of resolution this should be extremely rare. The rate of update is on the order of milliseconds, so at worst the runtime will be off of any given measurement by only a few CPU-milliseconds (and this is directly bounded by the rate of update). We're probably more inaccurate from the fact that we don't measure real CPU time but only approximate it. For #52890. Change-Id: I347f30ac9e2ba6061806c21dfe0193ef2ab3bbe9 Reviewed-on: https://go-review.googlesource.com/c/go/+/410120 Reviewed-by: Michael Pratt <mpratt@google.com> Run-TryBot: Michael Knyszek <mknyszek@google.com> TryBot-Result: Gopher Robot <gobot@golang.org>
2025-12-08 06:10:04 +00:00 · 2022-06-02 19:06:27 +00:00 · 2022-06-02 19:06:27 +00:00 · d7941030c9
commit d7941030c9
parent 162b88265e
6 changed files with 203 additions and 7 deletions
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@ -1415,6 +1415,7 @@ func NewGCCPULimiter(now int64, gomaxprocs int32) *GCCPULimiter {
 	// on a 32-bit architecture, it may get allocated unaligned
 	// space.
 	l := Escape(new(GCCPULimiter))
 	l.limiter.test = true
 	l.limiter.resetCapacity(now, gomaxprocs)
 	return l
 }
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@ -1281,6 +1281,10 @@ func gcBgMarkWorker() {
 		startTime := nanotime()
 		pp.gcMarkWorkerStartTime = startTime
 		var trackLimiterEvent bool
 		if pp.gcMarkWorkerMode == gcMarkWorkerIdleMode {
 			trackLimiterEvent = pp.limiterEvent.start(limiterEventIdleMarkWork, startTime)
 		}
 		decnwait := atomic.Xadd(&work.nwait, -1)
 		if decnwait == work.nproc {
@ -1329,9 +1333,8 @@ func gcBgMarkWorker() {
 		now := nanotime()
 		duration := now - startTime
 		gcController.markWorkerStop(pp.gcMarkWorkerMode, duration)
-		if pp.gcMarkWorkerMode == gcMarkWorkerIdleMode {
+		if trackLimiterEvent {
-			gcCPULimiter.addIdleMarkTime(duration)
+			pp.limiterEvent.stop(limiterEventIdleMarkWork, now)
 			gcCPULimiter.update(now)
 		}
 		if pp.gcMarkWorkerMode == gcMarkWorkerFractionalMode {
 			atomic.Xaddint64(&pp.gcFractionalMarkTime, duration)
--- a/src/runtime/mgclimit.go
+++ b/src/runtime/mgclimit.go
@ -76,6 +76,9 @@ type gcCPULimiterState struct {
 	//
 	// gomaxprocs isn't used directly so as to keep this structure unit-testable.
 	nprocs int32
 	// test indicates whether this instance of the struct was made for testing purposes.
 	test bool
 }
 // limiting returns true if the CPU limiter is currently enabled, meaning the Go GC
@ -192,6 +195,33 @@ func (l *gcCPULimiterState) updateLocked(now int64) {
 		l.idleMarkTimePool.Add(-idleMarkTime)
 	}
 	if !l.test {
 		// Consume time from in-flight events. Make sure we're not preemptible so allp can't change.
 		//
 		// The reason we do this instead of just waiting for those events to finish and push updates
 		// is to ensure that all the time we're accounting for happened sometime between lastUpdate
 		// and now. This dramatically simplifies reasoning about the limiter because we're not at
 		// risk of extra time being accounted for in this window than actually happened in this window,
 		// leading to all sorts of weird transient behavior.
 		mp := acquirem()
 		for _, pp := range allp {
 			typ, duration := pp.limiterEvent.consume(now)
 			switch typ {
 			case limiterEventIdleMarkWork:
 				idleMarkTime += duration
 			case limiterEventMarkAssist:
 				fallthrough
 			case limiterEventScavengeAssist:
 				assistTime += duration
 			case limiterEventNone:
 				break
 			default:
 				throw("invalid limiter event type found")
 			}
 		}
 		releasem(mp)
 	}
 	// Compute total GC time.
 	windowGCTime := assistTime
 	if l.gcEnabled {
@ -300,3 +330,144 @@ func (l *gcCPULimiterState) resetCapacity(now int64, nprocs int32) {
 	}
 	l.unlock()
 }
 // limiterEventType indicates the type of an event occuring on some P.
 //
 // These events represent the full set of events that the GC CPU limiter tracks
 // to execute its function.
 //
 // This type may use no more than limiterEventBits bits of information.
 type limiterEventType uint8
 const (
 	limiterEventNone           limiterEventType = iota // None of the following events.
 	limiterEventIdleMarkWork                           // Refers to an idle mark worker (see gcMarkWorkerMode).
 	limiterEventMarkAssist                             // Refers to mark assist (see gcAssistAlloc).
 	limiterEventScavengeAssist                         // Refers to a scavenge assist (see allocSpan).
 	limiterEventBits = 3
 )
 // limiterEventTypeMask is a mask for the bits in p.limiterEventStart that represent
 // the event type. The rest of the bits of that field represent a timestamp.
 const (
 	limiterEventTypeMask  = uint64((1<<limiterEventBits)-1) << (64 - limiterEventBits)
 	limiterEventStampNone = limiterEventStamp(0)
 )
 // limiterEventStamp is a nanotime timestamp packed with a limiterEventType.
 type limiterEventStamp uint64
 // makeLimiterEventStamp creates a new stamp from the event type and the current timestamp.
 func makeLimiterEventStamp(typ limiterEventType, now int64) limiterEventStamp {
 	return limiterEventStamp(uint64(typ)<<(64-limiterEventBits) | (uint64(now) &^ limiterEventTypeMask))
 }
 // duration computes the difference between now and the start time stored in the stamp.
 //
 // Returns 0 if the difference is negative, which may happen if now is stale or if the
 // before and after timestamps cross a 2^(64-limiterEventBits) boundary.
 func (s limiterEventStamp) duration(now int64) int64 {
 	// The top limiterEventBits bits of the timestamp are derived from the current time
 	// when computing a duration.
 	start := int64((uint64(now) & limiterEventTypeMask) | (uint64(s) &^ limiterEventTypeMask))
 	if now < start {
 		return 0
 	}
 	return now - start
 }
 // type extracts the event type from the stamp.
 func (s limiterEventStamp) typ() limiterEventType {
 	return limiterEventType(s >> (64 - limiterEventBits))
 }
 // limiterEvent represents tracking state for an event tracked by the GC CPU limiter.
 type limiterEvent struct {
 	stamp atomic.Uint64 // Stores a limiterEventStamp.
 }
 // start begins tracking a new limiter event of the current type. If an event
 // is already in flight, then a new event cannot begin because the current time is
 // already being attributed to that event. In this case, this function returns false.
 // Otherwise, it returns true.
 //
 // The caller must be non-preemptible until at least stop is called or this function
 // returns false. Because this is trying to measure "on-CPU" time of some event, getting
 // scheduled away during it can mean that whatever we're measuring isn't a reflection
 // of "on-CPU" time. The OS could deschedule us at any time, but we want to maintain as
 // close of an approximation as we can.
 func (e *limiterEvent) start(typ limiterEventType, now int64) bool {
 	if limiterEventStamp(e.stamp.Load()).typ() != limiterEventNone {
 		return false
 	}
 	e.stamp.Store(uint64(makeLimiterEventStamp(typ, now)))
 	return true
 }
 // consume acquires the partial event CPU time from any in-flight event.
 // It achieves this by storing the current time as the new event time.
 //
 // Returns the type of the in-flight event, as well as how long it's currently been
 // executing for. Returns limiterEventNone if no event is active.
 func (e *limiterEvent) consume(now int64) (typ limiterEventType, duration int64) {
 	// Read the limiter event timestamp and update it to now.
 	for {
 		old := limiterEventStamp(e.stamp.Load())
 		typ = old.typ()
 		if typ == limiterEventNone {
 			// There's no in-flight event, so just push that up.
 			return
 		}
 		duration = old.duration(now)
 		if duration == 0 {
 			// We might have a stale now value, or this crossed the
 			// 2^(64-limiterEventBits) boundary in the clock readings.
 			// Just ignore it.
 			return limiterEventNone, 0
 		}
 		new := makeLimiterEventStamp(typ, now)
 		if e.stamp.CompareAndSwap(uint64(old), uint64(new)) {
 			break
 		}
 	}
 	return
 }
 // stop stops the active limiter event. Throws if the
 //
 // The caller must be non-preemptible across the event. See start as to why.
 func (e *limiterEvent) stop(typ limiterEventType, now int64) {
 	var stamp limiterEventStamp
 	for {
 		stamp = limiterEventStamp(e.stamp.Load())
 		if stamp.typ() != typ {
 			print("runtime: want=", typ, " got=", stamp.typ(), "\n")
 			throw("limiterEvent.stop: found wrong event in p's limiter event slot")
 		}
 		if e.stamp.CompareAndSwap(uint64(stamp), uint64(limiterEventStampNone)) {
 			break
 		}
 	}
 	duration := stamp.duration(now)
 	if duration == 0 {
 		// It's possible that we're missing time because we crossed a
 		// 2^(64-limiterEventBits) boundary between the start and end.
 		// In this case, we're dropping that information. This is OK because
 		// at worst it'll cause a transient hiccup that will quickly resolve
 		// itself as all new timestamps begin on the other side of the boundary.
 		// Such a hiccup should be incredibly rare.
 		return
 	}
 	// Account for the event.
 	switch typ {
 	case limiterEventIdleMarkWork:
 		gcCPULimiter.addIdleMarkTime(duration)
 	case limiterEventMarkAssist:
 		fallthrough
 	case limiterEventScavengeAssist:
 		gcCPULimiter.addAssistTime(duration)
 	default:
 		throw("limiterEvent.stop: invalid limiter event type found")
 	}
 }
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@ -545,7 +545,11 @@ func gcAssistAlloc1(gp *g, scanWork int64) {
 	// Track time spent in this assist. Since we're on the
 	// system stack, this is non-preemptible, so we can
 	// just measure start and end time.
 	//
 	// Limiter event tracking might be disabled if we end up here
 	// while on a mark worker.
 	startTime := nanotime()
 	trackLimiterEvent := gp.m.p.ptr().limiterEvent.start(limiterEventMarkAssist, startTime)
 	decnwait := atomic.Xadd(&work.nwait, -1)
 	if decnwait == work.nproc {
@ -593,9 +597,11 @@ func gcAssistAlloc1(gp *g, scanWork int64) {
 	duration := now - startTime
 	_p_ := gp.m.p.ptr()
 	_p_.gcAssistTime += duration
 	if trackLimiterEvent {
 		_p_.limiterEvent.stop(limiterEventMarkAssist, now)
 	}
 	if _p_.gcAssistTime > gcAssistTimeSlack {
 		gcController.assistTime.Add(_p_.gcAssistTime)
 		gcCPULimiter.addAssistTime(_p_.gcAssistTime)
 		gcCPULimiter.update(now)
 		_p_.gcAssistTime = 0
 	}
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@ -1301,17 +1301,29 @@ HaveSpan:
 			}
 		}
 	}
-	if bytesToScavenge > 0 {
+	// There are a few very limited cirumstances where we won't have a P here.
 	// It's OK to simply skip scavenging in these cases. Something else will notice
 	// and pick up the tab.
 	if pp != nil && bytesToScavenge > 0 {
 		// Measure how long we spent scavenging and add that measurement to the assist
 		// time so we can track it for the GC CPU limiter.
 		//
 		// Limiter event tracking might be disabled if we end up here
 		// while on a mark worker.
 		start := nanotime()
 		track := pp.limiterEvent.start(limiterEventScavengeAssist, start)
 		// Scavenge, but back out if the limiter turns on.
 		h.pages.scavenge(bytesToScavenge, func() bool {
 			return gcCPULimiter.limiting()
 		})
 		// Finish up accounting.
 		now := nanotime()
 		if track {
 			pp.limiterEvent.stop(limiterEventScavengeAssist, now)
 		}
 		h.pages.scav.assistTime.Add(now - start)
 		gcCPULimiter.addAssistTime(now - start)
 		gcCPULimiter.update(now)
 	}
 	// Commit and account for any scavenged memory that the span now owns.
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@ -686,6 +686,9 @@ type p struct {
 	gcAssistTime         int64 // Nanoseconds in assistAlloc
 	gcFractionalMarkTime int64 // Nanoseconds in fractional mark worker (atomic)
 	// limiterEvent tracks events for the GC CPU limiter.
 	limiterEvent limiterEvent
 	// gcMarkWorkerMode is the mode for the next mark worker to run in.
 	// That is, this is used to communicate with the worker goroutine
 	// selected for immediate execution by