go/src/runtime/proc_test.go


// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package runtime_test
import (
"fmt"
"internal/race"
"internal/testenv"
"math"
"net"
"runtime"
"runtime/debug"
"strings"
"sync"
"sync/atomic"
"syscall"
"testing"
"time"
)
var stop = make(chan bool, 1)
func perpetuumMobile() {
select {
case <-stop:
default:
go perpetuumMobile()
}
}
func TestStopTheWorldDeadlock(t *testing.T) {
if runtime.GOARCH == "wasm" {
t.Skip("no preemption on wasm yet")
}
if testing.Short() {
t.Skip("skipping during short test")
}
maxprocs := runtime.GOMAXPROCS(3)
compl := make(chan bool, 2)
go func() {
for i := 0; i != 1000; i += 1 {
runtime.GC()
}
compl <- true
}()
go func() {
for i := 0; i != 1000; i += 1 {
runtime.GOMAXPROCS(3)
}
compl <- true
}()
go perpetuumMobile()
<-compl
<-compl
stop <- true
runtime.GOMAXPROCS(maxprocs)
}
func TestYieldProgress(t *testing.T) {
testYieldProgress(false)
}
func TestYieldLockedProgress(t *testing.T) {
testYieldProgress(true)
}
func testYieldProgress(locked bool) {
c := make(chan bool)
cack := make(chan bool)
go func() {
if locked {
runtime.LockOSThread()
}
for {
select {
case <-c:
cack <- true
return
default:
runtime.Gosched()
}
}
}()
time.Sleep(10 * time.Millisecond)
c <- true
<-cack
}
func TestYieldLocked(t *testing.T) {
const N = 10
c := make(chan bool)
go func() {
runtime.LockOSThread()
for i := 0; i < N; i++ {
runtime.Gosched()
time.Sleep(time.Millisecond)
}
c <- true
// runtime.UnlockOSThread() is deliberately omitted
}()
<-c
}
func TestGoroutineParallelism(t *testing.T) {
if runtime.NumCPU() == 1 {
// Takes too long, too easy to deadlock, etc.
t.Skip("skipping on uniprocessor")
}
P := 4
N := 10
if testing.Short() {
P = 3
N = 3
}
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(P))
// If runtime triggers a forced GC during this test then it will deadlock,
// since the goroutines can't be stopped/preempted.
// Disable GC for this test (see issue #10958).
defer debug.SetGCPercent(debug.SetGCPercent(-1))
for try := 0; try < N; try++ {
done := make(chan bool)
x := uint32(0)
for p := 0; p < P; p++ {
// Test that all P goroutines are scheduled at the same time
go func(p int) {
for i := 0; i < 3; i++ {
expected := uint32(P*i + p)
for atomic.LoadUint32(&x) != expected {
}
atomic.StoreUint32(&x, expected+1)
}
done <- true
}(p)
}
for p := 0; p < P; p++ {
<-done
}
}
}
// Test that all runnable goroutines are scheduled at the same time.
func TestGoroutineParallelism2(t *testing.T) {
//testGoroutineParallelism2(t, false, false)
testGoroutineParallelism2(t, true, false)
testGoroutineParallelism2(t, false, true)
testGoroutineParallelism2(t, true, true)
}
func testGoroutineParallelism2(t *testing.T, load, netpoll bool) {
if runtime.NumCPU() == 1 {
// Takes too long, too easy to deadlock, etc.
t.Skip("skipping on uniprocessor")
}
P := 4
N := 10
if testing.Short() {
N = 3
}
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(P))
// If runtime triggers a forced GC during this test then it will deadlock,
// since the goroutines can't be stopped/preempted.
// Disable GC for this test (see issue #10958).
defer debug.SetGCPercent(debug.SetGCPercent(-1))
for try := 0; try < N; try++ {
if load {
// Create P goroutines and wait until they all run.
// When we run the actual test below, worker threads
// running the goroutines will start parking.
done := make(chan bool)
x := uint32(0)
for p := 0; p < P; p++ {
go func() {
if atomic.AddUint32(&x, 1) == uint32(P) {
done <- true
return
}
for atomic.LoadUint32(&x) != uint32(P) {
}
}()
}
<-done
}
if netpoll {
// Enable netpoller, affects scheduler behavior.
laddr := "localhost:0"
if runtime.GOOS == "android" {
// On some Android devices, there are no records for localhost,
// see https://golang.org/issues/14486.
// Don't use 127.0.0.1 for every case, it won't work on IPv6-only systems.
laddr = "127.0.0.1:0"
}
ln, err := net.Listen("tcp", laddr)
if err == nil {
defer ln.Close() // yup, defer in a loop
}
}
done := make(chan bool)
x := uint32(0)
// Spawn P goroutines in a nested fashion just to differ from TestGoroutineParallelism.
for p := 0; p < P/2; p++ {
go func(p int) {
for p2 := 0; p2 < 2; p2++ {
go func(p2 int) {
for i := 0; i < 3; i++ {
expected := uint32(P*i + p*2 + p2)
for atomic.LoadUint32(&x) != expected {
}
atomic.StoreUint32(&x, expected+1)
}
done <- true
}(p2)
}
}(p)
}
for p := 0; p < P; p++ {
<-done
}
}
}
func TestBlockLocked(t *testing.T) {
const N = 10
c := make(chan bool)
go func() {
runtime.LockOSThread()
for i := 0; i < N; i++ {
c <- true
}
runtime.UnlockOSThread()
}()
for i := 0; i < N; i++ {
<-c
}
}
func TestTimerFairness(t *testing.T) {
if runtime.GOARCH == "wasm" {
t.Skip("no preemption on wasm yet")
}
done := make(chan bool)
c := make(chan bool)
for i := 0; i < 2; i++ {
go func() {
for {
select {
case c <- true:
case <-done:
return
}
}
}()
}
timer := time.After(20 * time.Millisecond)
for {
select {
case <-c:
case <-timer:
close(done)
return
}
}
}
func TestTimerFairness2(t *testing.T) {
if runtime.GOARCH == "wasm" {
t.Skip("no preemption on wasm yet")
}
done := make(chan bool)
c := make(chan bool)
for i := 0; i < 2; i++ {
go func() {
timer := time.After(20 * time.Millisecond)
var buf [1]byte
for {
syscall.Read(0, buf[0:0])
select {
case c <- true:
case <-c:
case <-timer:
done <- true
return
}
}
}()
}
<-done
<-done
}
// The function is used to test preemption at split stack checks.
// Declaring a var avoids inlining at the call site.
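// Each call to preempt therefore passes through the stack-growth check in
// the function prologue, which is where the runtime delivers cooperative
// preemption requests.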
var preempt = func() int {
var a [128]int
sum := 0
for _, v := range a {
sum += v
}
return sum
}
func TestPreemption(t *testing.T) {
if runtime.GOARCH == "wasm" {
t.Skip("no preemption on wasm yet")
}
// Test that goroutines are preempted at function calls.
N := 5
if testing.Short() {
N = 2
}
c := make(chan bool)
var x uint32
for g := 0; g < 2; g++ {
go func(g int) {
for i := 0; i < N; i++ {
for atomic.LoadUint32(&x) != uint32(g) {
preempt()
}
atomic.StoreUint32(&x, uint32(1-g))
}
c <- true
}(g)
}
<-c
<-c
}
func TestPreemptionGC(t *testing.T) {
if runtime.GOARCH == "wasm" {
t.Skip("no preemption on wasm yet")
}
// Test that pending GC preempts running goroutines.
P := 5
N := 10
if testing.Short() {
P = 3
N = 2
}
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(P + 1))
var stop uint32
for i := 0; i < P; i++ {
go func() {
for atomic.LoadUint32(&stop) == 0 {
preempt()
}
}()
}
for i := 0; i < N; i++ {
runtime.Gosched()
runtime.GC()
}
atomic.StoreUint32(&stop, 1)
}
func TestAsyncPreempt(t *testing.T) {
if !runtime.PreemptMSupported {
t.Skip("asynchronous preemption not supported on this platform")
}
output := runTestProg(t, "testprog", "AsyncPreempt")
want := "OK\n"
if output != want {
t.Fatalf("want %s, got %s\n", want, output)
}
}
func TestGCFairness(t *testing.T) {
output := runTestProg(t, "testprog", "GCFairness")
want := "OK\n"
if output != want {
t.Fatalf("want %s, got %s\n", want, output)
}
}
func TestGCFairness2(t *testing.T) {
output := runTestProg(t, "testprog", "GCFairness2")
want := "OK\n"
if output != want {
t.Fatalf("want %s, got %s\n", want, output)
}
}
func TestNumGoroutine(t *testing.T) {
output := runTestProg(t, "testprog", "NumGoroutine")
want := "1\n"
if output != want {
t.Fatalf("want %q, got %q", want, output)
}
buf := make([]byte, 1<<20)
// Try up to 10 times for a match before giving up.
// This is a fundamentally racy check but it's important
// to notice if NumGoroutine and Stack are _always_ out of sync.
for i := 0; ; i++ {
// Give goroutines about to exit a chance to exit.
// The NumGoroutine and Stack below need to see
// the same state of the world, so anything we can do
// to keep it quiet is good.
runtime.Gosched()
n := runtime.NumGoroutine()
buf = buf[:runtime.Stack(buf, true)]
nstk := strings.Count(string(buf), "goroutine ")
if n == nstk {
break
}
if i >= 10 {
t.Fatalf("NumGoroutine=%d, but found %d goroutines in stack dump: %s", n, nstk, buf)
}
}
}
func TestPingPongHog(t *testing.T) {
if runtime.GOARCH == "wasm" {
t.Skip("no preemption on wasm yet")
}
if testing.Short() {
t.Skip("skipping in -short mode")
}
if race.Enabled {
// The race detector randomizes the scheduler,
// which causes this test to fail (#38266).
t.Skip("skipping in -race mode")
}
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(1))
done := make(chan bool)
hogChan, lightChan := make(chan bool), make(chan bool)
hogCount, lightCount := 0, 0
run := func(limit int, counter *int, wake chan bool) {
for {
select {
case <-done:
return
case <-wake:
for i := 0; i < limit; i++ {
*counter++
}
wake <- true
}
}
}
// Start two co-scheduled hog goroutines.
for i := 0; i < 2; i++ {
go run(1e6, &hogCount, hogChan)
}
// Start two co-scheduled light goroutines.
for i := 0; i < 2; i++ {
go run(1e3, &lightCount, lightChan)
}
// Start goroutine pairs and wait for a few preemption rounds.
hogChan <- true
lightChan <- true
time.Sleep(100 * time.Millisecond)
close(done)
<-hogChan
<-lightChan
// Check that hogCount and lightCount are within a factor of
// 5, which indicates that both pairs of goroutines handed off
// the P within a time-slice to their buddy. We can use a
// fairly large factor here to make this robust: if the
// scheduler isn't working right, the gap should be ~1000X.
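// (The ~1000X figure comes from the ratio of the two loop limits: without
// the handoff, both pairs complete wakeups at roughly the same rate, so the
// counters diverge by about 1e6/1e3.)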
const factor = 5
if hogCount > lightCount*factor || lightCount > hogCount*factor {
t.Fatalf("want hogCount/lightCount in [%v, %v]; got %d/%d = %g", 1.0/factor, factor, hogCount, lightCount, float64(hogCount)/float64(lightCount))
}
}
func BenchmarkPingPongHog(b *testing.B) {
if b.N == 0 {
return
}
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(1))
// Create a CPU hog
stop, done := make(chan bool), make(chan bool)
go func() {
for {
select {
case <-stop:
done <- true
return
default:
}
}
}()
// Ping-pong b.N times
ping, pong := make(chan bool), make(chan bool)
go func() {
for j := 0; j < b.N; j++ {
pong <- <-ping
}
close(stop)
done <- true
}()
go func() {
for i := 0; i < b.N; i++ {
ping <- <-pong
}
done <- true
}()
b.ResetTimer()
ping <- true // Start ping-pong
<-stop
b.StopTimer()
<-ping // Let last ponger exit
<-done // Make sure goroutines exit
<-done
<-done
}
var padData [128]uint64
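// stackGrowthRecursive copies padData into a local array so that the
// compiler cannot reduce pad to a single word and must reserve the whole
// array in the stack frame (see issue 40641); the recursion then forces
// repeated stack growth.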
func stackGrowthRecursive(i int) {
var pad [128]uint64
pad = padData
for j := range pad {
if pad[j] != 0 {
return
}
}
if i != 0 {
stackGrowthRecursive(i - 1)
}
}
func TestPreemptSplitBig(t *testing.T) {
if testing.Short() {
t.Skip("skipping in -short mode")
}
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2))
stop := make(chan int)
go big(stop)
for i := 0; i < 3; i++ {
time.Sleep(10 * time.Microsecond) // let big start running
runtime.GC()
}
close(stop)
}
func big(stop chan int) int {
n := 0
for {
// delay so that gc is sure to have asked for a preemption
for i := 0; i < 1e9; i++ {
n++
}
// call bigframe, which used to miss the preemption in its prologue.
bigframe(stop)
// check if we've been asked to stop.
select {
case <-stop:
return n
default:
}
}
}
func bigframe(stop chan int) int {
// not splitting the stack will overflow.
// small will notice that it needs a stack split and will
// catch the overflow.
var x [8192]byte
return small(stop, &x)
}
func small(stop chan int, x *[8192]byte) int {
for i := range x {
x[i] = byte(i)
}
sum := 0
for i := range x {
sum += int(x[i])
}
// keep small from being a leaf function, which might
// make it not do any stack check at all.
nonleaf(stop)
return sum
}
func nonleaf(stop chan int) bool {
// do something that won't be inlined:
select {
case <-stop:
return true
default:
return false
}
}
func TestSchedLocalQueue(t *testing.T) {
runtime.RunSchedLocalQueueTest()
}
func TestSchedLocalQueueSteal(t *testing.T) {
runtime.RunSchedLocalQueueStealTest()
}
func TestSchedLocalQueueEmpty(t *testing.T) {
if runtime.NumCPU() == 1 {
// Takes too long and does not trigger the race.
t.Skip("skipping on uniprocessor")
}
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(4))
// If runtime triggers a forced GC during this test then it will deadlock,
// since the goroutines can't be stopped/preempted during spin wait.
defer debug.SetGCPercent(debug.SetGCPercent(-1))
iters := int(1e5)
if testing.Short() {
iters = 1e2
}
runtime.RunSchedLocalQueueEmptyTest(iters)
}
func benchmarkStackGrowth(b *testing.B, rec int) {
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
stackGrowthRecursive(rec)
}
})
}
func BenchmarkStackGrowth(b *testing.B) {
benchmarkStackGrowth(b, 10)
}
func BenchmarkStackGrowthDeep(b *testing.B) {
benchmarkStackGrowth(b, 1024)
}
func BenchmarkCreateGoroutines(b *testing.B) {
benchmarkCreateGoroutines(b, 1)
}
func BenchmarkCreateGoroutinesParallel(b *testing.B) {
benchmarkCreateGoroutines(b, runtime.GOMAXPROCS(-1))
}
func benchmarkCreateGoroutines(b *testing.B, procs int) {
c := make(chan bool)
var f func(n int)
f = func(n int) {
if n == 0 {
c <- true
return
}
go f(n - 1)
}
for i := 0; i < procs; i++ {
go f(b.N / procs)
}
for i := 0; i < procs; i++ {
<-c
}
}
func BenchmarkCreateGoroutinesCapture(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
const N = 4
var wg sync.WaitGroup
wg.Add(N)
for i := 0; i < N; i++ {
i := i
go func() {
if i >= N {
b.Logf("bad") // just to capture b
}
wg.Done()
}()
}
wg.Wait()
}
}
// warmupScheduler ensures the scheduler has at least targetThreadCount threads
// in its thread pool.
func warmupScheduler(targetThreadCount int) {
var wg sync.WaitGroup
var count int32
for i := 0; i < targetThreadCount; i++ {
wg.Add(1)
go func() {
atomic.AddInt32(&count, 1)
for atomic.LoadInt32(&count) < int32(targetThreadCount) {
// spin until all threads started
}
// spin a bit more to ensure they are all running on separate CPUs.
doWork(time.Millisecond)
wg.Done()
}()
}
wg.Wait()
}
func doWork(dur time.Duration) {
start := time.Now()
for time.Since(start) < dur {
}
}
// BenchmarkCreateGoroutinesSingle creates many goroutines, all from a single
// producer (the main benchmark goroutine).
//
// Compared to BenchmarkCreateGoroutines, this causes different behavior in the
// scheduler because Ms are much more likely to need to steal work from the
// main P rather than having work in the local run queue.
func BenchmarkCreateGoroutinesSingle(b *testing.B) {
// Since we are interested in stealing behavior, warm the scheduler to
// get all the Ps running first.
warmupScheduler(runtime.GOMAXPROCS(0))
b.ResetTimer()
var wg sync.WaitGroup
wg.Add(b.N)
for i := 0; i < b.N; i++ {
go func() {
wg.Done()
}()
}
wg.Wait()
}
func BenchmarkClosureCall(b *testing.B) {
sum := 0
off1 := 1
for i := 0; i < b.N; i++ {
off2 := 2
func() {
sum += i + off1 + off2
}()
}
_ = sum
}
func benchmarkWakeupParallel(b *testing.B, spin func(time.Duration)) {
if runtime.GOMAXPROCS(0) == 1 {
b.Skip("skipping: GOMAXPROCS=1")
}
wakeDelay := 5 * time.Microsecond
for _, delay := range []time.Duration{
0,
1 * time.Microsecond,
2 * time.Microsecond,
5 * time.Microsecond,
10 * time.Microsecond,
20 * time.Microsecond,
50 * time.Microsecond,
100 * time.Microsecond,
} {
b.Run(delay.String(), func(b *testing.B) {
if b.N == 0 {
return
}
// Start two goroutines, which alternate between being
// sender and receiver in the following protocol:
//
// - The receiver spins for `delay` and then does a
// blocking receive on a channel.
//
// - The sender spins for `delay+wakeDelay` and then
// sends to the same channel. (The addition of
// `wakeDelay` improves the probability that the
// receiver will be blocking when the send occurs when
// the goroutines execute in parallel.)
//
// In each iteration of the benchmark, each goroutine
// acts once as sender and once as receiver, so each
// goroutine spins for delay twice.
//
// BenchmarkWakeupParallel is used to estimate how
// efficiently the scheduler parallelizes goroutines in
// the presence of blocking:
//
// - If both goroutines are executed on the same core,
// an increase in delay by N will increase the time per
// iteration by 4*N, because all 4 delays are
// serialized.
//
// - Otherwise, an increase in delay by N will increase
// the time per iteration by 2*N, and the time per
// iteration is 2 * (runtime overhead + chan
// send/receive pair + delay + wakeDelay). This allows
// the runtime overhead, including the time it takes
// for the unblocked goroutine to be scheduled, to be
// estimated.
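//
// As a rough worked illustration (numbers are indicative only): if the
// spinning variant measures ~52µs per iteration at delay = 10µs while the
// goroutines run in parallel, then
// runtime overhead + chan send/receive pair ≈ 52µs/2 - 10µs - 5µs ≈ 11µs.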
ping, pong := make(chan struct{}), make(chan struct{})
start := make(chan struct{})
done := make(chan struct{})
go func() {
<-start
for i := 0; i < b.N; i++ {
// sender
spin(delay + wakeDelay)
ping <- struct{}{}
// receiver
spin(delay)
<-pong
}
done <- struct{}{}
}()
go func() {
for i := 0; i < b.N; i++ {
// receiver
spin(delay)
<-ping
// sender
spin(delay + wakeDelay)
pong <- struct{}{}
}
done <- struct{}{}
}()
b.ResetTimer()
start <- struct{}{}
<-done
<-done
})
}
}
func BenchmarkWakeupParallelSpinning(b *testing.B) {
benchmarkWakeupParallel(b, func(d time.Duration) {
end := time.Now().Add(d)
for time.Now().Before(end) {
// do nothing
}
})
}
// sysNanosleep is defined by OS-specific files (such as runtime_linux_test.go)
// to sleep for the given duration. If nil, dependent tests are skipped.
// The implementation should invoke a blocking system call and not
// call time.Sleep, which would deschedule the goroutine.
var sysNanosleep func(d time.Duration)
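// A minimal sketch of how an OS-specific file might wire it up on Linux
// (illustrative only, not the actual definition):
//
//	func init() {
//		sysNanosleep = func(d time.Duration) {
//			// Block the thread in the kernel; time.Sleep would
//			// deschedule the goroutine instead.
//			ts := syscall.NsecToTimespec(d.Nanoseconds())
//			for syscall.Nanosleep(&ts, &ts) == syscall.EINTR {
//				// Retry with the remaining time on interruption.
//			}
//		}
//	}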
func BenchmarkWakeupParallelSyscall(b *testing.B) {
if sysNanosleep == nil {
b.Skipf("skipping on %v; sysNanosleep not defined", runtime.GOOS)
}
benchmarkWakeupParallel(b, func(d time.Duration) {
sysNanosleep(d)
})
}
type Matrix [][]float64
func BenchmarkMatmult(b *testing.B) {
b.StopTimer()
// matmult is O(N**3) but testing expects O(b.N),
// so we need to take cube root of b.N
n := int(math.Cbrt(float64(b.N))) + 1
A := makeMatrix(n)
B := makeMatrix(n)
C := makeMatrix(n)
b.StartTimer()
matmult(nil, A, B, C, 0, n, 0, n, 0, n, 8)
}
func makeMatrix(n int) Matrix {
m := make(Matrix, n)
for i := 0; i < n; i++ {
m[i] = make([]float64, n)
for j := 0; j < n; j++ {
m[i][j] = float64(i*n + j)
}
}
return m
}
func matmult(done chan<- struct{}, A, B, C Matrix, i0, i1, j0, j1, k0, k1, threshold int) {
di := i1 - i0
dj := j1 - j0
dk := k1 - k0
if di >= dj && di >= dk && di >= threshold {
// divide in two by y axis
mi := i0 + di/2
done1 := make(chan struct{}, 1)
go matmult(done1, A, B, C, i0, mi, j0, j1, k0, k1, threshold)
matmult(nil, A, B, C, mi, i1, j0, j1, k0, k1, threshold)
<-done1
} else if dj >= dk && dj >= threshold {
// divide in two by x axis
mj := j0 + dj/2
done1 := make(chan struct{}, 1)
go matmult(done1, A, B, C, i0, i1, j0, mj, k0, k1, threshold)
matmult(nil, A, B, C, i0, i1, mj, j1, k0, k1, threshold)
<-done1
} else if dk >= threshold {
// divide in two by "k" axis
// deliberately not parallel because of data races
mk := k0 + dk/2
matmult(nil, A, B, C, i0, i1, j0, j1, k0, mk, threshold)
matmult(nil, A, B, C, i0, i1, j0, j1, mk, k1, threshold)
} else {
// the matrices are small enough, compute directly
for i := i0; i < i1; i++ {
for j := j0; j < j1; j++ {
for k := k0; k < k1; k++ {
C[i][j] += A[i][k] * B[k][j]
}
}
}
}
if done != nil {
done <- struct{}{}
}
}
func TestStealOrder(t *testing.T) {
runtime.RunStealOrderTest()
}
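
// TestLockOSThreadNesting checks that LockOSThread/UnlockOSThread calls nest:
// after two locks and one unlock the external lock count is still 1, and the
// second unlock drops it back to 0. The checks run on a separate goroutine so
// any lock state stays off the test's own goroutine.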
func TestLockOSThreadNesting(t *testing.T) {
if runtime.GOARCH == "wasm" {
t.Skip("no threads on wasm yet")
}
go func() {
e, i := runtime.LockOSCounts()
if e != 0 || i != 0 {
t.Errorf("want locked counts 0, 0; got %d, %d", e, i)
return
}
runtime.LockOSThread()
runtime.LockOSThread()
runtime.UnlockOSThread()
e, i = runtime.LockOSCounts()
if e != 1 || i != 0 {
t.Errorf("want locked counts 1, 0; got %d, %d", e, i)
return
}
runtime.UnlockOSThread()
e, i = runtime.LockOSCounts()
if e != 0 || i != 0 {
t.Errorf("want locked counts 0, 0; got %d, %d", e, i)
return
}
}()
}
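
// TestLockOSThreadExit runs the LockOSThreadMain and LockOSThreadAlt test
// programs and checks that each reports OK.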
func TestLockOSThreadExit(t *testing.T) {
testLockOSThreadExit(t, "testprog")
}
func testLockOSThreadExit(t *testing.T, prog string) {
output := runTestProg(t, prog, "LockOSThreadMain", "GOMAXPROCS=1")
want := "OK\n"
if output != want {
t.Errorf("want %q, got %q", want, output)
}
output = runTestProg(t, prog, "LockOSThreadAlt")
if output != want {
t.Errorf("want %q, got %q", want, output)
}
}
func TestLockOSThreadAvoidsStatePropagation(t *testing.T) {
want := "OK\n"
skip := "unshare not permitted\n"
output := runTestProg(t, "testprog", "LockOSThreadAvoidsStatePropagation", "GOMAXPROCS=1")
if output == skip {
t.Skip("unshare syscall not permitted on this system")
} else if output != want {
t.Errorf("want %q, got %q", want, output)
}
}
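
// TestLockOSThreadTemplateThreadRace repeatedly runs the
// LockOSThreadTemplateThreadRace test program, trying to catch races between
// LockOSThread and template thread startup (see issue 38931).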
func TestLockOSThreadTemplateThreadRace(t *testing.T) {
testenv.MustHaveGoRun(t)
exe, err := buildTestProg(t, "testprog")
if err != nil {
t.Fatal(err)
}
iterations := 100
if testing.Short() {
// Reduce run time to ~100ms, with much lower probability of
// catching issues.
iterations = 5
}
for i := 0; i < iterations; i++ {
want := "OK\n"
output := runBuiltTestProg(t, exe, "LockOSThreadTemplateThreadRace")
if output != want {
t.Fatalf("run %d: want %q, got %q", i, want, output)
}
}
}
// fakeSyscall emulates a system call.
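// It spins between Entersyscall and Exitsyscall for roughly the requested
// duration, so the scheduler sees a system call of that length.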
//go:nosplit
func fakeSyscall(duration time.Duration) {
runtime.Entersyscall()
for start := runtime.Nanotime(); runtime.Nanotime()-start < int64(duration); {
}
runtime.Exitsyscall()
}
// Check that a goroutine will be preempted even if it is repeatedly making short system calls.
func testPreemptionAfterSyscall(t *testing.T, syscallDuration time.Duration) {
if runtime.GOARCH == "wasm" {
t.Skip("no preemption on wasm yet")
}
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2))
	iterations := 10
	if testing.Short() {
		iterations = 1
}
const (
maxDuration = 3 * time.Second
nroutines = 8
)
	for i := 0; i < iterations; i++ {
c := make(chan bool, nroutines)
stop := uint32(0)
start := time.Now()
for g := 0; g < nroutines; g++ {
go func(stop *uint32) {
c <- true
for atomic.LoadUint32(stop) == 0 {
fakeSyscall(syscallDuration)
}
c <- true
}(&stop)
}
// wait until all goroutines have started.
for g := 0; g < nroutines; g++ {
<-c
}
atomic.StoreUint32(&stop, 1)
// wait until all goroutines have finished.
for g := 0; g < nroutines; g++ {
<-c
}
duration := time.Since(start)
if duration > maxDuration {
t.Errorf("timeout exceeded: %v (%v)", duration, maxDuration)
}
}
}
func TestPreemptionAfterSyscall(t *testing.T) {
for _, i := range []time.Duration{10, 100, 1000} {
d := i * time.Microsecond
t.Run(fmt.Sprint(d), func(t *testing.T) {
testPreemptionAfterSyscall(t, d)
})
}
}
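
// TestGetgThreadSwitch runs the runtime's internal check that getg still
// returns the current g after the goroutine has been rescheduled onto a
// different OS thread.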
func TestGetgThreadSwitch(t *testing.T) {
runtime.RunGetgThreadSwitchTest()
}
// TestNetpollBreak tests that netpollBreak can break a netpoll.
// This test is not particularly safe since the call to netpoll
// will pick up any stray files that are ready, but it should work
// OK as long as it is not run in parallel.
func TestNetpollBreak(t *testing.T) {
if runtime.GOMAXPROCS(0) == 1 {
t.Skip("skipping: GOMAXPROCS=1")
}
// Make sure that netpoll is initialized.
runtime.NetpollGenericInit()
start := time.Now()
c := make(chan bool, 2)
go func() {
c <- true
runtime.Netpoll(10 * time.Second.Nanoseconds())
c <- true
}()
<-c
// Loop because the break might get eaten by the scheduler.
// Break twice to break both the netpoll we started and the
// scheduler netpoll.
loop:
for {
runtime.Usleep(100)
runtime.NetpollBreak()
runtime.NetpollBreak()
select {
case <-c:
break loop
default:
}
}
if dur := time.Since(start); dur > 5*time.Second {
t.Errorf("netpollBreak did not interrupt netpoll: slept for: %v", dur)
}
}
// TestBigGOMAXPROCS tests that setting GOMAXPROCS to a large value
// doesn't cause a crash at startup. See issue 38474.
func TestBigGOMAXPROCS(t *testing.T) {
t.Parallel()
output := runTestProg(t, "testprog", "NonexistentTest", "GOMAXPROCS=1024")
// Ignore error conditions on small machines.
for _, errstr := range []string{
"failed to create new OS thread",
"cannot allocate memory",
} {
if strings.Contains(output, errstr) {
t.Skipf("failed to create 1024 threads")
}
}
if !strings.Contains(output, "unknown function: NonexistentTest") {
t.Errorf("output:\n%s\nwanted:\nunknown function: NonexistentTest", output)
}
}