runtime: make TestCPUMetricsSleep even more lenient

This test was introduced as a regression test for #60276. However, it
was quite flaky on a number of different platforms because there are
myriad ways the runtime can eat into time one might expect is completely
idle.

This change re-enables the test, but makes it much more resilient.
Because the issue we're testing for is persistent, we now require 10
consecutive failures to count. Any single success counts as a test
success. This change also makes the test's idle time bound more lenient,
allowing for a little bit of time to be eaten up. The regression we're
testing for results in nearly zero idle time being accounted for.

If this is still not good enough to eliminate flakes, this test should
just be deleted.

For #60276.
Fixes #60376.

Change-Id: Icd81f0c9970821b7f386f6d27c8a566fee4d0ff7
Reviewed-on: https://go-review.googlesource.com/c/go/+/498274
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Michael Pratt <mpratt@google.com>
This commit is contained in:
Michael Anthony Knyszek 2023-05-26 17:56:39 +00:00 committed by Michael Knyszek
parent d59926ec7e
commit 125c2cac64

View file

@ -5,7 +5,6 @@
package runtime_test package runtime_test
import ( import (
"internal/testenv"
"reflect" "reflect"
"runtime" "runtime"
"runtime/debug" "runtime/debug"
@ -661,17 +660,6 @@ func TestCPUMetricsSleep(t *testing.T) {
t.Skip("wasip1 currently busy-waits in idle time; test not applicable") t.Skip("wasip1 currently busy-waits in idle time; test not applicable")
} }
// Unconditionally skip this test as flaky.
//
// There's a fundamental issue with this test, which is that there's no
// guarantee the application will go idle; background goroutines and
// time spent in the scheduler going to sleep can always erode idle time
// sufficiently such that minimum idle time (or maximum user time) stays
// within some threshold.
//
// Leave this as skipped while we figure out a better way to check this.
testenv.SkipFlaky(t, 60376)
names := []string{ names := []string{
"/cpu/classes/idle:cpu-seconds", "/cpu/classes/idle:cpu-seconds",
@ -695,33 +683,70 @@ func TestCPUMetricsSleep(t *testing.T) {
} }
m1, m2 := prep(), prep() m1, m2 := prep(), prep()
const (
// Expected time spent idle.
dur = 100 * time.Millisecond
// maxFailures is the number of consecutive failures requires to cause the test to fail.
maxFailures = 10
)
failureIdleTimes := make([]float64, 0, maxFailures)
// If the bug we expect is happening, then the Sleep CPU time will be accounted for
// as user time rather than idle time. In an ideal world we'd expect the whole application
// to go instantly idle the moment this goroutine goes to sleep, and stay asleep for that
// duration. However, the Go runtime can easily eat into idle time while this goroutine is
// blocked in a sleep. For example, slow platforms might spend more time expected in the
// scheduler. Another example is that a Go runtime background goroutine could run while
// everything else is idle. Lastly, if a running goroutine is descheduled by the OS, enough
// time may pass such that the goroutine is ready to wake, even though the runtime couldn't
// observe itself as idle with nanotime.
//
// To deal with all this, we give a half-proc's worth of leniency.
//
// We also retry multiple times to deal with the fact that the OS might deschedule us before
// we yield and go idle. That has a rare enough chance that retries should resolve it.
// If the issue we expect is happening, it should be persistent.
minIdleCPUSeconds := dur.Seconds() * (float64(runtime.GOMAXPROCS(-1)) - 0.5)
// Let's make sure there's no background scavenge work to do.
//
// The runtime.GC calls below ensure the background sweeper
// will not run during the idle period.
debug.FreeOSMemory()
for retries := 0; retries < maxFailures; retries++ {
// Read 1. // Read 1.
runtime.GC() // Update /cpu/classes metrics. runtime.GC() // Update /cpu/classes metrics.
metrics.Read(m1) metrics.Read(m1)
// Sleep. // Sleep.
const dur = 100 * time.Millisecond
time.Sleep(dur) time.Sleep(dur)
// Read 2. // Read 2.
runtime.GC() // Update /cpu/classes metrics. runtime.GC() // Update /cpu/classes metrics.
metrics.Read(m2) metrics.Read(m2)
// If the bug we expect is happening, then the Sleep CPU time will be accounted for dt := m2[0].Value.Float64() - m1[0].Value.Float64()
// as user time rather than idle time. if dt >= minIdleCPUSeconds {
// // All is well. Test passed.
// TODO(mknyszek): This number here is wrong. Background goroutines and just slow return
// platforms spending a non-trivial amount of time in the scheduler doing things }
// could easily erode idle time beyond this minimum. failureIdleTimes = append(failureIdleTimes, dt)
minIdleCPUSeconds := dur.Seconds() * float64(runtime.GOMAXPROCS(-1)) // Try again.
}
if dt := m2[0].Value.Float64() - m1[0].Value.Float64(); dt < minIdleCPUSeconds { // We couldn't observe the expected idle time even once.
for i, dt := range failureIdleTimes {
t.Logf("try %2d: idle time = %.5fs\n", i+1, dt)
}
t.Logf("try %d breakdown:\n", len(failureIdleTimes))
for i := range names { for i := range names {
if m1[i].Value.Kind() == metrics.KindBad { if m1[i].Value.Kind() == metrics.KindBad {
continue continue
} }
t.Logf("%s %0.3f\n", names[i], m2[i].Value.Float64()-m1[i].Value.Float64()) t.Logf("\t%s %0.3f\n", names[i], m2[i].Value.Float64()-m1[i].Value.Float64())
}
t.Errorf(`time.Sleep did not contribute enough to "idle" class: %.5fs < %.5fs`, dt, minIdleCPUSeconds)
} }
t.Errorf(`time.Sleep did not contribute enough to "idle" class: minimum idle time = %.5fs`, minIdleCPUSeconds)
} }