diff --git a/src/runtime/metrics.go b/src/runtime/metrics.go index 2bff44c4668..75ad318b07b 100644 --- a/src/runtime/metrics.go +++ b/src/runtime/metrics.go @@ -165,6 +165,12 @@ func initMetrics() { out.scalar = uint64(in.heapStats.tinyAllocCount) }, }, + "/gc/limiter/last-enabled:gc-cycle": { + compute: func(_ *statAggregate, out *metricValue) { + out.kind = metricKindUint64 + out.scalar = uint64(gcCPULimiter.lastEnabledCycle.Load()) + }, + }, "/gc/pauses:seconds": { compute: func(_ *statAggregate, out *metricValue) { hist := out.float64HistOrInit(timeHistBuckets) diff --git a/src/runtime/metrics/description.go b/src/runtime/metrics/description.go index a33d9a2c35a..ee99d3938df 100644 --- a/src/runtime/metrics/description.go +++ b/src/runtime/metrics/description.go @@ -140,6 +140,15 @@ var allDesc = []Description{ Kind: KindUint64, Cumulative: true, }, + { + Name: "/gc/limiter/last-enabled:gc-cycle", + Description: "GC cycle the last time the GC CPU limiter was enabled. " + + "This metric is useful for diagnosing the root cause of an out-of-memory " + + "error, because the limiter trades memory for CPU time when the GC's CPU " + + "time gets too high. This is most likely to occur with use of SetMemoryLimit. " + + "The first GC cycle is cycle 1, so a value of 0 indicates that it was never enabled.", + Kind: KindUint64, + }, { Name: "/gc/pauses:seconds", Description: "Distribution individual GC-related stop-the-world pause latencies.", diff --git a/src/runtime/metrics/doc.go b/src/runtime/metrics/doc.go index b4d99f72bb5..28c9f6abb5e 100644 --- a/src/runtime/metrics/doc.go +++ b/src/runtime/metrics/doc.go @@ -102,6 +102,13 @@ Below is the full list of supported metrics, ordered lexicographically. only their block. Each block is already accounted for in allocs-by-size and frees-by-size. + /gc/limiter/last-enabled:gc-cycle + GC cycle the last time the GC CPU limiter was enabled. + This metric is useful for diagnosing the root cause of an out-of-memory + error, because the limiter trades memory for CPU time when the GC's CPU + time gets too high. This is most likely to occur with use of SetMemoryLimit. + The first GC cycle is cycle 1, so a value of 0 indicates that it was never enabled. + /gc/pauses:seconds Distribution individual GC-related stop-the-world pause latencies. diff --git a/src/runtime/mgclimit.go b/src/runtime/mgclimit.go index b930af3340e..cbe5500be61 100644 --- a/src/runtime/mgclimit.go +++ b/src/runtime/mgclimit.go @@ -40,8 +40,8 @@ type gcCPULimiterState struct { // - fill <= capacity fill, capacity uint64 } - // TODO(mknyszek): Export this as a runtime/metric to provide an estimate of - // how much GC work is being dropped on the floor. + // overflow is the cumulative amount of GC CPU time that we tried to fill the + // bucket with but exceeded its capacity. overflow uint64 // gcEnabled is an internal copy of gcBlackenEnabled that determines @@ -65,6 +65,9 @@ type gcCPULimiterState struct { // Updated under lock, but may be read concurrently. lastUpdate atomic.Int64 + // lastEnabledCycle is the GC cycle that last had the limiter enabled. + lastEnabledCycle atomic.Uint32 + // nprocs is an internal copy of gomaxprocs, used to determine total available // CPU time. // @@ -203,6 +206,7 @@ func (l *gcCPULimiterState) accumulate(mutatorTime, gcTime int64) { l.bucket.fill = l.bucket.capacity if !enabled { l.enabled.Store(true) + l.lastEnabledCycle.Store(memstats.numgc + 1) } return } @@ -254,6 +258,7 @@ func (l *gcCPULimiterState) resetCapacity(now int64, nprocs int32) { if l.bucket.fill > l.bucket.capacity { l.bucket.fill = l.bucket.capacity l.enabled.Store(true) + l.lastEnabledCycle.Store(memstats.numgc + 1) } else if l.bucket.fill < l.bucket.capacity { l.enabled.Store(false) }