runtime: move GC pause time CPU metrics update into the STW

This change fixes a possible race with updating metrics and reading
them. The update is intended to be protected by the world being stopped,
but here, it clearly isn't.

Fixing this lets us lower the thresholds in the metrics tests by an
order of magnitude, because the only thing we have to worry about now is
floating point error (the tests were previously written assuming the
floating point error was much higher than it actually was; that turns
out not to be the case, and this bug was the problem instead). However,
this still isn't that tight of a bound; we still want to catch any and
all problems of exactness. For this purpose, this CL adds a test to
check the source-of-truth (in uint64 nanoseconds) that ensures the
totals exactly match.

This means we unfortunately have to take another time measurement, but
for now let's prioritize correctness. A few additional nanoseconds of
STW time won't be terribly noticable.

Fixes #66212.

Change-Id: Id02c66e8a43c13b1f70e9b268b8a84cc72293bfd
Reviewed-on: https://go-review.googlesource.com/c/go/+/570257
Auto-Submit: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Nicolas Hillegeer <aktau@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
Michael Anthony Knyszek 2024-03-08 23:06:41 +00:00 committed by Gopher Robot
parent 8008998b14
commit 017478a963
5 changed files with 94 additions and 49 deletions

View file

@ -357,11 +357,11 @@ func TestReadMetricsConsistency(t *testing.T) {
if cpu.idle <= 0 {
t.Errorf("found no idle time: %f", cpu.idle)
}
if total := cpu.gcDedicated + cpu.gcAssist + cpu.gcIdle + cpu.gcPause; !withinEpsilon(cpu.gcTotal, total, 0.01) {
t.Errorf("calculated total GC CPU not within 1%% of sampled total: %f vs. %f", total, cpu.gcTotal)
if total := cpu.gcDedicated + cpu.gcAssist + cpu.gcIdle + cpu.gcPause; !withinEpsilon(cpu.gcTotal, total, 0.001) {
t.Errorf("calculated total GC CPU time not within %%0.1 of total: %f vs. %f", total, cpu.gcTotal)
}
if total := cpu.scavengeAssist + cpu.scavengeBg; !withinEpsilon(cpu.scavengeTotal, total, 0.01) {
t.Errorf("calculated total scavenge CPU not within 1%% of sampled total: %f vs. %f", total, cpu.scavengeTotal)
if total := cpu.scavengeAssist + cpu.scavengeBg; !withinEpsilon(cpu.scavengeTotal, total, 0.001) {
t.Errorf("calculated total scavenge CPU not within %%0.1 of total: %f vs. %f", total, cpu.scavengeTotal)
}
if cpu.total <= 0 {
t.Errorf("found no total CPU time passed")
@ -369,8 +369,8 @@ func TestReadMetricsConsistency(t *testing.T) {
if cpu.user <= 0 {
t.Errorf("found no user time passed")
}
if total := cpu.gcTotal + cpu.scavengeTotal + cpu.user + cpu.idle; !withinEpsilon(cpu.total, total, 0.02) {
t.Errorf("calculated total CPU not within 2%% of sampled total: %f vs. %f", total, cpu.total)
if total := cpu.gcTotal + cpu.scavengeTotal + cpu.user + cpu.idle; !withinEpsilon(cpu.total, total, 0.001) {
t.Errorf("calculated total CPU not within %%0.1 of total: %f vs. %f", total, cpu.total)
}
}
if totalVirtual.got != totalVirtual.want {
@ -1290,3 +1290,39 @@ func (w *contentionWorker) run() {
for w.fn() {
}
}
func TestCPUStats(t *testing.T) {
// Run a few GC cycles to get some of the stats to be non-zero.
runtime.GC()
runtime.GC()
runtime.GC()
// Set GOMAXPROCS high then sleep briefly to ensure we generate
// some idle time.
oldmaxprocs := runtime.GOMAXPROCS(10)
time.Sleep(time.Millisecond)
runtime.GOMAXPROCS(oldmaxprocs)
stats := runtime.ReadCPUStats()
gcTotal := stats.GCAssistTime + stats.GCDedicatedTime + stats.GCIdleTime + stats.GCPauseTime
if gcTotal != stats.GCTotalTime {
t.Errorf("manually computed total does not match GCTotalTime: %d cpu-ns vs. %d cpu-ns", gcTotal, stats.GCTotalTime)
}
scavTotal := stats.ScavengeAssistTime + stats.ScavengeBgTime
if scavTotal != stats.ScavengeTotalTime {
t.Errorf("manually computed total does not match ScavengeTotalTime: %d cpu-ns vs. %d cpu-ns", scavTotal, stats.ScavengeTotalTime)
}
total := gcTotal + scavTotal + stats.IdleTime + stats.UserTime
if total != stats.TotalTime {
t.Errorf("manually computed overall total does not match TotalTime: %d cpu-ns vs. %d cpu-ns", total, stats.TotalTime)
}
if total == 0 {
t.Error("total time is zero")
}
if gcTotal == 0 {
t.Error("GC total time is zero")
}
if stats.IdleTime == 0 {
t.Error("idle time is zero")
}
}